1use scirs2_core::ndarray::{Array1, Array2};
7use scirs2_linalg::eigh;
8use std::collections::VecDeque;
9
10use crate::error::{Result, TransformError};
11
12pub trait StreamingTransformer: Send + Sync {
14 fn partial_fit(&mut self, x: &Array2<f64>) -> Result<()>;
16
17 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>>;
19
20 fn reset(&mut self);
22
23 fn n_samples_seen(&self) -> usize;
25}
26
27pub struct StreamingStandardScaler {
29 mean: Array1<f64>,
31 variance: Array1<f64>,
33 n_samples: usize,
35 with_mean: bool,
37 withstd: bool,
39 epsilon: f64,
41}
42
43impl StreamingStandardScaler {
44 pub fn new(_nfeatures: usize, with_mean: bool, withstd: bool) -> Self {
46 StreamingStandardScaler {
47 mean: Array1::zeros(_nfeatures),
48 variance: Array1::zeros(_nfeatures),
49 n_samples: 0,
50 with_mean,
51 withstd,
52 epsilon: 1e-8,
53 }
54 }
55
56 fn update_statistics(&mut self, x: &Array2<f64>) {
58 let batch_size = x.shape()[0];
59 let nfeatures = x.shape()[1];
60
61 for i in 0..batch_size {
62 self.n_samples += 1;
63 let n = self.n_samples as f64;
64
65 for j in 0..nfeatures {
66 let value = x[[i, j]];
67 let delta = value - self.mean[j];
68 self.mean[j] += delta / n;
69
70 if self.withstd {
71 let delta2 = value - self.mean[j];
72 self.variance[j] += delta * delta2;
73 }
74 }
75 }
76 }
77
78 fn get_std(&self) -> Array1<f64> {
80 if self.n_samples <= 1 {
81 Array1::ones(self.mean.len())
82 } else {
83 self.variance
84 .mapv(|v| (v / (self.n_samples - 1) as f64).sqrt().max(self.epsilon))
85 }
86 }
87}
88
89impl StreamingTransformer for StreamingStandardScaler {
90 fn partial_fit(&mut self, x: &Array2<f64>) -> Result<()> {
91 if x.shape()[1] != self.mean.len() {
92 return Err(TransformError::InvalidInput(format!(
93 "Expected {} features, got {}",
94 self.mean.len(),
95 x.shape()[1]
96 )));
97 }
98
99 self.update_statistics(x);
100 Ok(())
101 }
102
103 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
104 if x.shape()[1] != self.mean.len() {
105 return Err(TransformError::InvalidInput(format!(
106 "Expected {} features, got {}",
107 self.mean.len(),
108 x.shape()[1]
109 )));
110 }
111
112 let mut result = x.to_owned();
113
114 if self.with_mean {
115 for i in 0..result.shape()[0] {
116 for j in 0..result.shape()[1] {
117 result[[i, j]] -= self.mean[j];
118 }
119 }
120 }
121
122 if self.withstd {
123 let std = self.get_std();
124 for i in 0..result.shape()[0] {
125 for j in 0..result.shape()[1] {
126 result[[i, j]] /= std[j];
127 }
128 }
129 }
130
131 Ok(result)
132 }
133
134 fn reset(&mut self) {
135 self.mean.fill(0.0);
136 self.variance.fill(0.0);
137 self.n_samples = 0;
138 }
139
140 fn n_samples_seen(&self) -> usize {
141 self.n_samples
142 }
143}
144
145pub struct StreamingMinMaxScaler {
147 min: Array1<f64>,
149 max: Array1<f64>,
151 featurerange: (f64, f64),
153 n_samples: usize,
155}
156
157impl StreamingMinMaxScaler {
158 pub fn new(_nfeatures: usize, featurerange: (f64, f64)) -> Self {
160 StreamingMinMaxScaler {
161 min: Array1::from_elem(_nfeatures, f64::INFINITY),
162 max: Array1::from_elem(_nfeatures, f64::NEG_INFINITY),
163 featurerange,
164 n_samples: 0,
165 }
166 }
167
168 fn update_bounds(&mut self, x: &Array2<f64>) {
170 for i in 0..x.shape()[0] {
171 for j in 0..x.shape()[1] {
172 let value = x[[i, j]];
173 self.min[j] = self.min[j].min(value);
174 self.max[j] = self.max[j].max(value);
175 }
176 self.n_samples += 1;
177 }
178 }
179}
180
181impl StreamingTransformer for StreamingMinMaxScaler {
182 fn partial_fit(&mut self, x: &Array2<f64>) -> Result<()> {
183 if x.shape()[1] != self.min.len() {
184 return Err(TransformError::InvalidInput(format!(
185 "Expected {} features, got {}",
186 self.min.len(),
187 x.shape()[1]
188 )));
189 }
190
191 self.update_bounds(x);
192 Ok(())
193 }
194
195 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
196 if x.shape()[1] != self.min.len() {
197 return Err(TransformError::InvalidInput(format!(
198 "Expected {} features, got {}",
199 self.min.len(),
200 x.shape()[1]
201 )));
202 }
203
204 let mut result = Array2::zeros((x.nrows(), x.ncols()));
205 let (min_val, max_val) = self.featurerange;
206 let scale = max_val - min_val;
207
208 for i in 0..x.shape()[0] {
209 for j in 0..x.shape()[1] {
210 let range = self.max[j] - self.min[j];
211 if range > 1e-10 {
212 result[[i, j]] = (x[[i, j]] - self.min[j]) / range * scale + min_val;
213 } else {
214 result[[i, j]] = (min_val + max_val) / 2.0;
215 }
216 }
217 }
218
219 Ok(result)
220 }
221
222 fn reset(&mut self) {
223 self.min.fill(f64::INFINITY);
224 self.max.fill(f64::NEG_INFINITY);
225 self.n_samples = 0;
226 }
227
228 fn n_samples_seen(&self) -> usize {
229 self.n_samples
230 }
231}
232
233pub struct StreamingQuantileTracker {
235 quantiles: Vec<f64>,
237 p2_states: Vec<Vec<P2State>>,
239 nfeatures: usize,
241}
242
243struct P2State {
245 n: [f64; 5],
247 q: [f64; 5],
249 n_prime: [f64; 5],
251 count: usize,
253 p: f64,
255}
256
257impl P2State {
258 fn new(p: f64) -> Self {
259 P2State {
260 n: [1.0, 2.0, 3.0, 4.0, 5.0],
261 q: [0.0; 5],
262 n_prime: [1.0, 1.0 + 2.0 * p, 1.0 + 4.0 * p, 3.0 + 2.0 * p, 5.0],
263 count: 0,
264 p,
265 }
266 }
267
268 fn update(&mut self, value: f64) {
269 if self.count < 5 {
270 self.q[self.count] = value;
271 self.count += 1;
272
273 if self.count == 5 {
274 self.q.sort_by(|a, b| a.partial_cmp(b).unwrap());
276 }
277 return;
278 }
279
280 let mut k = 0;
282 for i in 1..5 {
283 if value < self.q[i] {
284 k = i - 1;
285 break;
286 }
287 }
288 if value >= self.q[4] {
289 k = 3;
290 }
291
292 for i in (k + 1)..5 {
294 self.n[i] += 1.0;
295 }
296
297 for i in 0..5 {
299 self.n_prime[i] += match i {
300 0 => 0.0,
301 1 => self.p / 2.0,
302 2 => self.p,
303 3 => (1.0 + self.p) / 2.0,
304 4 => 1.0,
305 _ => unreachable!(),
306 };
307 }
308
309 for i in 1..4 {
311 let d = self.n_prime[i] - self.n[i];
312
313 if (d >= 1.0 && self.n[i + 1] - self.n[i] > 1.0)
314 || (d <= -1.0 && self.n[i - 1] - self.n[i] < -1.0)
315 {
316 let d_sign = d.signum();
317
318 let qi = self.parabolic_interpolation(i, d_sign);
320
321 if self.q[i - 1] < qi && qi < self.q[i + 1] {
322 self.q[i] = qi;
323 } else {
324 self.q[i] = self.linear_interpolation(i, d_sign);
326 }
327
328 self.n[i] += d_sign;
329 }
330 }
331 }
332
333 fn parabolic_interpolation(&self, i: usize, d: f64) -> f64 {
334 let qi = self.q[i];
335 let qim1 = self.q[i - 1];
336 let qip1 = self.q[i + 1];
337 let ni = self.n[i];
338 let nim1 = self.n[i - 1];
339 let nip1 = self.n[i + 1];
340
341 qi + d / (nip1 - nim1)
342 * ((ni - nim1 + d) * (qip1 - qi) / (nip1 - ni)
343 + (nip1 - ni - d) * (qi - qim1) / (ni - nim1))
344 }
345
346 fn linear_interpolation(&self, i: usize, d: f64) -> f64 {
347 let j = if d > 0.0 { i + 1 } else { i - 1 };
348 self.q[i] + d * (self.q[j] - self.q[i]) / (self.n[j] - self.n[i])
349 }
350
351 fn quantile(&self) -> f64 {
352 if self.count < 5 {
353 let mut sorted = self.q[..self.count].to_vec();
355 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
356 sorted[sorted.len() / 2]
357 } else {
358 self.q[2] }
360 }
361}
362
363impl StreamingQuantileTracker {
364 pub fn new(nfeatures: usize, quantiles: Vec<f64>) -> Result<Self> {
366 for &q in &quantiles {
368 if !(0.0..=1.0).contains(&q) {
369 return Err(TransformError::InvalidInput(format!(
370 "Quantile {q} must be between 0 and 1"
371 )));
372 }
373 }
374
375 let mut p2_states = Vec::with_capacity(nfeatures);
376 for _ in 0..nfeatures {
377 let feature_states: Vec<P2State> = quantiles.iter().map(|&q| P2State::new(q)).collect();
378 p2_states.push(feature_states);
379 }
380
381 Ok(StreamingQuantileTracker {
382 quantiles,
383 p2_states,
384 nfeatures,
385 })
386 }
387
388 pub fn update(&mut self, x: &Array2<f64>) -> Result<()> {
390 if x.shape()[1] != self.nfeatures {
391 return Err(TransformError::InvalidInput(format!(
392 "Expected {} features, got {}",
393 self.nfeatures,
394 x.shape()[1]
395 )));
396 }
397
398 for i in 0..x.shape()[0] {
399 for j in 0..x.shape()[1] {
400 let value = x[[i, j]];
401 for k in 0..self.quantiles.len() {
402 self.p2_states[j][k].update(value);
403 }
404 }
405 }
406
407 Ok(())
408 }
409
410 pub fn get_quantiles(&self) -> Array2<f64> {
412 let mut result = Array2::zeros((self.nfeatures, self.quantiles.len()));
413
414 for j in 0..self.nfeatures {
415 for k in 0..self.quantiles.len() {
416 result[[j, k]] = self.p2_states[j][k].quantile();
417 }
418 }
419
420 result
421 }
422}
423
424pub struct WindowedStreamingTransformer<T: StreamingTransformer> {
426 transformer: T,
428 window: VecDeque<Array2<f64>>,
430 windowsize: usize,
432 current_size: usize,
434}
435
436impl<T: StreamingTransformer> WindowedStreamingTransformer<T> {
437 pub fn new(_transformer: T, windowsize: usize) -> Self {
439 WindowedStreamingTransformer {
440 transformer: _transformer,
441 window: VecDeque::with_capacity(windowsize),
442 windowsize,
443 current_size: 0,
444 }
445 }
446
447 pub fn update(&mut self, x: &Array2<f64>) -> Result<()> {
449 self.window.push_back(x.to_owned());
451 self.current_size += x.shape()[0];
452
453 while self.current_size > self.windowsize && !self.window.is_empty() {
455 if let Some(old_data) = self.window.pop_front() {
456 self.current_size -= old_data.shape()[0];
457 }
458 }
459
460 self.transformer.reset();
462 for data in &self.window {
463 self.transformer.partial_fit(data)?;
464 }
465
466 Ok(())
467 }
468
469 pub fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
471 self.transformer.transform(x)
472 }
473}
474
475pub struct StreamingPCA {
477 mean: Array1<f64>,
479 components: Option<Array2<f64>>,
481 explained_variance: Option<Array1<f64>>,
483 n_components: usize,
485 nfeatures: usize,
487 n_samples: usize,
489 alpha: f64,
491 min_samples: usize,
493 cov_matrix: Array2<f64>,
495}
496
497impl StreamingPCA {
498 pub fn new(
500 nfeatures: usize,
501 n_components: usize,
502 alpha: f64,
503 min_samples: usize,
504 ) -> Result<Self> {
505 if n_components > nfeatures {
506 return Err(TransformError::InvalidInput(
507 "n_components cannot be larger than nfeatures".to_string(),
508 ));
509 }
510 if alpha <= 0.0 || alpha > 1.0 {
511 return Err(TransformError::InvalidInput(
512 "alpha must be in (0, 1]".to_string(),
513 ));
514 }
515
516 Ok(StreamingPCA {
517 mean: Array1::zeros(nfeatures),
518 components: None,
519 explained_variance: None,
520 n_components,
521 nfeatures,
522 n_samples: 0,
523 alpha,
524 min_samples,
525 cov_matrix: Array2::zeros((nfeatures, nfeatures)),
526 })
527 }
528
529 pub fn update(&mut self, x: &Array2<f64>) -> Result<()> {
531 if x.shape()[1] != self.nfeatures {
532 return Err(TransformError::InvalidInput(format!(
533 "Expected {} features, got {}",
534 self.nfeatures,
535 x.shape()[1]
536 )));
537 }
538
539 let _batch_size = x.shape()[0];
540
541 for sample in x.rows() {
543 self.n_samples += 1;
544 let weight = if self.n_samples == 1 { 1.0 } else { self.alpha };
545
546 for (i, &value) in sample.iter().enumerate() {
547 self.mean[i] = (1.0 - weight) * self.mean[i] + weight * value;
548 }
549 }
550
551 if self.n_samples >= self.min_samples {
553 for sample in x.rows() {
554 let centered = &sample.to_owned() - &self.mean;
555 let outer_product = centered
556 .clone()
557 .insert_axis(scirs2_core::ndarray::Axis(1))
558 .dot(¢ered.insert_axis(scirs2_core::ndarray::Axis(0)));
559
560 let weight = self.alpha;
561 self.cov_matrix = (1.0 - weight) * &self.cov_matrix + weight * outer_product;
562 }
563
564 self.compute_pca()?;
566 }
567
568 Ok(())
569 }
570
571 fn compute_pca(&mut self) -> Result<()> {
572 let (eigenvalues, eigenvectors) = eigh(&self.cov_matrix.view(), None).map_err(|e| {
574 TransformError::ComputationError(format!("Eigendecomposition failed: {e}"))
575 })?;
576
577 let mut eigen_pairs: Vec<(f64, Array1<f64>)> = eigenvalues
579 .iter()
580 .zip(eigenvectors.columns())
581 .map(|(&val, vec)| (val, vec.to_owned()))
582 .collect();
583
584 eigen_pairs.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
585
586 let mut components = Array2::zeros((self.n_components, self.nfeatures));
588 let mut explained_var = Array1::zeros(self.n_components);
589
590 for (i, (eigenval, eigenvec)) in eigen_pairs.iter().take(self.n_components).enumerate() {
591 components.row_mut(i).assign(eigenvec);
592 explained_var[i] = eigenval.max(0.0);
593 }
594
595 self.components = Some(components);
596 self.explained_variance = Some(explained_var);
597 Ok(())
598 }
599
600 pub fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
602 if let Some(ref components) = self.components {
603 if x.shape()[1] != self.nfeatures {
604 return Err(TransformError::InvalidInput(format!(
605 "Expected {} features, got {}",
606 self.nfeatures,
607 x.shape()[1]
608 )));
609 }
610
611 let mut result = Array2::zeros((x.shape()[0], self.n_components));
612
613 for (i, sample) in x.rows().into_iter().enumerate() {
614 let centered = &sample.to_owned() - &self.mean;
615 let transformed = components.dot(¢ered);
616 result.row_mut(i).assign(&transformed);
617 }
618
619 Ok(result)
620 } else {
621 Err(TransformError::TransformationError(
622 "PCA not computed yet, need more samples".to_string(),
623 ))
624 }
625 }
626
627 pub fn explained_variance_ratio(&self) -> Option<Array1<f64>> {
629 self.explained_variance.as_ref().map(|var| {
630 let total_var = var.sum();
631 if total_var > 0.0 {
632 var / total_var
633 } else {
634 Array1::zeros(var.len())
635 }
636 })
637 }
638
639 pub fn reset(&mut self) {
641 self.mean.fill(0.0);
642 self.components = None;
643 self.explained_variance = None;
644 self.n_samples = 0;
645 self.cov_matrix.fill(0.0);
646 }
647}
648
649pub struct StreamingOutlierDetector {
651 means: Array1<f64>,
653 variances: Array1<f64>,
654 n_samples: usize,
656 nfeatures: usize,
658 threshold: f64,
660 method: OutlierMethod,
662}
663
664#[derive(Debug, Clone)]
666pub enum OutlierMethod {
667 ZScore,
669 ModifiedZScore,
671 IsolationScore,
673}
674
675impl StreamingOutlierDetector {
676 pub fn new(nfeatures: usize, threshold: f64, method: OutlierMethod) -> Self {
678 StreamingOutlierDetector {
679 means: Array1::zeros(nfeatures),
680 variances: Array1::zeros(nfeatures),
681 n_samples: 0,
682 nfeatures,
683 threshold,
684 method,
685 }
686 }
687
688 pub fn update(&mut self, x: &Array2<f64>) -> Result<()> {
690 if x.shape()[1] != self.nfeatures {
691 return Err(TransformError::InvalidInput(format!(
692 "Expected {} features, got {}",
693 self.nfeatures,
694 x.shape()[1]
695 )));
696 }
697
698 for sample in x.rows() {
700 self.n_samples += 1;
701 let n = self.n_samples as f64;
702
703 for (i, &value) in sample.iter().enumerate() {
704 let delta = value - self.means[i];
705 self.means[i] += delta / n;
706 let delta2 = value - self.means[i];
707 self.variances[i] += delta * delta2;
708 }
709 }
710
711 Ok(())
712 }
713
714 pub fn detect_outliers(&self, x: &Array2<f64>) -> Result<Array1<bool>> {
716 if x.shape()[1] != self.nfeatures {
717 return Err(TransformError::InvalidInput(format!(
718 "Expected {} features, got {}",
719 self.nfeatures,
720 x.shape()[1]
721 )));
722 }
723
724 if self.n_samples < 2 {
725 return Ok(Array1::from_elem(x.shape()[0], false));
727 }
728
729 let mut outliers = Array1::from_elem(x.shape()[0], false);
730
731 match self.method {
732 OutlierMethod::ZScore => {
733 let stds = self.get_standard_deviations();
734
735 for (i, sample) in x.rows().into_iter().enumerate() {
736 let mut is_outlier = false;
737 for (j, &value) in sample.iter().enumerate() {
738 if stds[j] > 1e-8 {
739 let z_score = (value - self.means[j]).abs() / stds[j];
740 if z_score > self.threshold {
741 is_outlier = true;
742 break;
743 }
744 }
745 }
746 outliers[i] = is_outlier;
747 }
748 }
749 OutlierMethod::ModifiedZScore => {
750 for (i, sample) in x.rows().into_iter().enumerate() {
752 let mut is_outlier = false;
753 for (j, &value) in sample.iter().enumerate() {
754 let mad_estimate =
755 (self.variances[j] / (self.n_samples - 1) as f64).sqrt() * 0.6745;
756 if mad_estimate > 1e-8 {
757 let modified_z = 0.6745 * (value - self.means[j]).abs() / mad_estimate;
758 if modified_z > self.threshold {
759 is_outlier = true;
760 break;
761 }
762 }
763 }
764 outliers[i] = is_outlier;
765 }
766 }
767 OutlierMethod::IsolationScore => {
768 for (i, sample) in x.rows().into_iter().enumerate() {
770 let anomaly_score = self.compute_isolation_score(sample);
771 outliers[i] = anomaly_score > self.threshold;
772 }
773 }
774 }
775
776 Ok(outliers)
777 }
778
779 fn get_standard_deviations(&self) -> Array1<f64> {
780 if self.n_samples <= 1 {
781 Array1::ones(self.nfeatures)
782 } else {
783 self.variances
784 .mapv(|v| (v / (self.n_samples - 1) as f64).sqrt().max(1e-8))
785 }
786 }
787
788 fn compute_isolation_score(&self, sample: scirs2_core::ndarray::ArrayView1<f64>) -> f64 {
790 let mut path_length = 0.0;
791 let mut current_sample = sample.to_owned();
792
793 let max_depth = ((self.n_samples as f64).log2().ceil() as usize).min(20);
795
796 for depth in 0..max_depth {
797 let mut min_split_distance = f64::INFINITY;
798
799 for j in 0..self.nfeatures {
801 let std_dev = (self.variances[j] / (self.n_samples - 1) as f64).sqrt();
802 if std_dev > 1e-8 {
803 let normalized_distance = (current_sample[j] - self.means[j]).abs() / std_dev;
805
806 let split_effectiveness = normalized_distance * (1.0 + depth as f64 * 0.1);
808 min_split_distance = min_split_distance.min(split_effectiveness);
809 }
810 }
811
812 if min_split_distance > 3.0 {
814 path_length += depth as f64 + min_split_distance / 3.0;
815 break;
816 }
817
818 path_length += 1.0;
819
820 for j in 0..self.nfeatures {
822 let adjustment = (current_sample[j] - self.means[j]) * 0.1;
823 current_sample[j] -= adjustment;
824 }
825 }
826
827 let expected_path_length = if self.n_samples > 2 {
830 2.0 * ((self.n_samples - 1) as f64).ln() + (std::f64::consts::E * 0.57721566)
831 - 2.0 * (self.n_samples - 1) as f64 / self.n_samples as f64
832 } else {
833 1.0
834 };
835
836 2.0_f64.powf(-path_length / expected_path_length)
838 }
839
840 pub fn anomaly_scores(&self, x: &Array2<f64>) -> Result<Array1<f64>> {
842 if x.shape()[1] != self.nfeatures {
843 return Err(TransformError::InvalidInput(format!(
844 "Expected {} features, got {}",
845 self.nfeatures,
846 x.shape()[1]
847 )));
848 }
849
850 if self.n_samples < 2 {
851 return Ok(Array1::zeros(x.shape()[0]));
852 }
853
854 let mut scores = Array1::zeros(x.shape()[0]);
855 let stds = self.get_standard_deviations();
856
857 for (i, sample) in x.rows().into_iter().enumerate() {
858 let mut score = 0.0;
859 for (j, &value) in sample.iter().enumerate() {
860 if stds[j] > 1e-8 {
861 let z_score = (value - self.means[j]).abs() / stds[j];
862 score += z_score;
863 }
864 }
865 scores[i] = score / self.nfeatures as f64;
866 }
867
868 Ok(scores)
869 }
870
871 pub fn reset(&mut self) {
873 self.means.fill(0.0);
874 self.variances.fill(0.0);
875 self.n_samples = 0;
876 }
877}
878
879pub struct StreamingFeatureSelector {
881 variances: Array1<f64>,
883 means: Array1<f64>,
885 correlations: Array2<f64>,
887 n_samples: usize,
889 nfeatures: usize,
891 variance_threshold: f64,
893 correlationthreshold: f64,
895 selected_features: Option<Vec<usize>>,
897}
898
899impl StreamingFeatureSelector {
900 pub fn new(nfeatures: usize, variance_threshold: f64, correlationthreshold: f64) -> Self {
902 StreamingFeatureSelector {
903 variances: Array1::zeros(nfeatures),
904 means: Array1::zeros(nfeatures),
905 correlations: Array2::zeros((nfeatures, nfeatures)),
906 n_samples: 0,
907 nfeatures,
908 variance_threshold,
909 correlationthreshold,
910 selected_features: None,
911 }
912 }
913
914 pub fn update(&mut self, x: &Array2<f64>) -> Result<()> {
916 if x.shape()[1] != self.nfeatures {
917 return Err(TransformError::InvalidInput(format!(
918 "Expected {} features, got {}",
919 self.nfeatures,
920 x.shape()[1]
921 )));
922 }
923
924 for sample in x.rows() {
926 self.n_samples += 1;
927 let n = self.n_samples as f64;
928
929 for (i, &value) in sample.iter().enumerate() {
931 let delta = value - self.means[i];
932 self.means[i] += delta / n;
933 let delta2 = value - self.means[i];
934 self.variances[i] += delta * delta2;
935 }
936
937 if self.n_samples > 1 {
939 for i in 0..self.nfeatures {
940 for j in (i + 1)..self.nfeatures {
941 let val_i = sample[i] - self.means[i];
942 let val_j = sample[j] - self.means[j];
943 let covar_update = val_i * val_j / (n - 1.0);
944 self.correlations[[i, j]] =
945 (self.correlations[[i, j]] * (n - 2.0) + covar_update) / (n - 1.0);
946 }
947 }
948 }
949 }
950
951 if self.n_samples >= 10 {
953 self.update_selected_features();
955 }
956
957 Ok(())
958 }
959
960 fn update_selected_features(&mut self) {
961 let mut selected = Vec::new();
962 let current_variances = self.get_current_variances();
963
964 for i in 0..self.nfeatures {
966 if current_variances[i] > self.variance_threshold {
967 selected.push(i);
968 }
969 }
970
971 let mut final_selected = Vec::new();
973 for &i in &selected {
974 let mut should_include = true;
975
976 for &j in &final_selected {
977 if i != j {
978 let corr = self.get_correlation(i, j, ¤t_variances);
979 if corr.abs() > self.correlationthreshold {
980 if current_variances[i] <= current_variances[j] {
982 should_include = false;
983 break;
984 }
985 }
986 }
987 }
988
989 if should_include {
990 final_selected.push(i);
991 }
992 }
993
994 self.selected_features = Some(final_selected);
995 }
996
997 fn get_current_variances(&self) -> Array1<f64> {
998 if self.n_samples <= 1 {
999 Array1::zeros(self.nfeatures)
1000 } else {
1001 self.variances.mapv(|v| v / (self.n_samples - 1) as f64)
1002 }
1003 }
1004
1005 fn get_correlation(&self, i: usize, j: usize, variances: &Array1<f64>) -> f64 {
1006 let var_i = variances[i];
1007 let var_j = variances[j];
1008
1009 if var_i > 1e-8 && var_j > 1e-8 {
1010 let idx = if i < j { (i, j) } else { (j, i) };
1011 self.correlations[idx] / (var_i * var_j).sqrt()
1012 } else {
1013 0.0
1014 }
1015 }
1016
1017 pub fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
1019 if let Some(ref selected) = self.selected_features {
1020 if x.shape()[1] != self.nfeatures {
1021 return Err(TransformError::InvalidInput(format!(
1022 "Expected {} features, got {}",
1023 self.nfeatures,
1024 x.shape()[1]
1025 )));
1026 }
1027
1028 if selected.is_empty() {
1029 return Ok(Array2::zeros((x.shape()[0], 0)));
1030 }
1031
1032 let mut result = Array2::zeros((x.shape()[0], selected.len()));
1033
1034 for (row_idx, sample) in x.rows().into_iter().enumerate() {
1035 for (col_idx, &feature_idx) in selected.iter().enumerate() {
1036 result[[row_idx, col_idx]] = sample[feature_idx];
1037 }
1038 }
1039
1040 Ok(result)
1041 } else {
1042 Ok(x.to_owned())
1044 }
1045 }
1046
1047 pub fn get_selected_features(&self) -> Option<&Vec<usize>> {
1049 self.selected_features.as_ref()
1050 }
1051
1052 pub fn n_features_selected(&self) -> usize {
1054 self.selected_features
1055 .as_ref()
1056 .map_or(self.nfeatures, |s| s.len())
1057 }
1058
1059 pub fn reset(&mut self) {
1061 self.variances.fill(0.0);
1062 self.means.fill(0.0);
1063 self.correlations.fill(0.0);
1064 self.n_samples = 0;
1065 self.selected_features = None;
1066 }
1067}