1use crate::error::{CoreError, CoreResult, ErrorContext};
29use ::ndarray::{Array1, Array2, Axis};
30use num_traits::{Float, FromPrimitive, NumCast, Zero};
31use std::collections::HashMap;
32use std::fmt::{Debug, Display};
33use std::hash::Hash;
34
/// Standardizes features by removing the mean and scaling to unit variance.
///
/// Fitted state (`mean`, `std_dev`) is `None` until `fit` succeeds;
/// `with_mean` / `with_std` toggle centering and scaling independently.
#[derive(Debug, Clone)]
pub struct StandardScaler<F: Float> {
    /// Per-feature mean learned by `fit`; `None` before fitting.
    pub mean: Option<Array1<F>>,
    /// Per-feature standard deviation learned by `fit` (population std,
    /// i.e. variance divides by n); `None` before fitting.
    pub std_dev: Option<Array1<F>>,
    /// If `true`, subtract the per-feature mean in `transform`.
    pub with_mean: bool,
    /// If `true`, divide by the per-feature std in `transform`.
    pub with_std: bool,
}
65
66impl<F: Float + FromPrimitive + Debug + Display + std::iter::Sum> StandardScaler<F> {
67 #[must_use]
69 pub fn new() -> Self {
70 Self {
71 mean: None,
72 std_dev: None,
73 with_mean: true,
74 with_std: true,
75 }
76 }
77
78 #[must_use]
80 pub fn with_options(with_mean: bool, with_std: bool) -> Self {
81 Self {
82 mean: None,
83 std_dev: None,
84 with_mean,
85 with_std,
86 }
87 }
88
89 pub fn fit(&mut self, data: &Array2<F>) -> CoreResult<()> {
91 let n_samples = data.nrows();
92 if n_samples == 0 {
93 return Err(CoreError::ValueError(ErrorContext::new(
94 "Cannot fit StandardScaler on empty data",
95 )));
96 }
97 let n_f = F::from_usize(n_samples).ok_or_else(|| {
98 CoreError::ValueError(ErrorContext::new("Failed to convert n_samples to float"))
99 })?;
100 let n_cols = data.ncols();
101 let mut mean_arr = Array1::<F>::zeros(n_cols);
102 let mut std_arr = Array1::<F>::zeros(n_cols);
103
104 for j in 0..n_cols {
105 let col = data.column(j);
106 let sum: F = col.iter().copied().sum();
107 let m = sum / n_f;
108 mean_arr[j] = m;
109
110 let var_sum: F = col.iter().map(|&x| (x - m) * (x - m)).sum();
111 let var = var_sum / n_f;
112 std_arr[j] = var.sqrt();
113 }
114
115 self.mean = Some(mean_arr);
116 self.std_dev = Some(std_arr);
117 Ok(())
118 }
119
120 pub fn transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
122 let mean = self.mean.as_ref().ok_or_else(|| {
123 CoreError::InvalidState(ErrorContext::new("StandardScaler not fitted"))
124 })?;
125 let std_dev = self.std_dev.as_ref().ok_or_else(|| {
126 CoreError::InvalidState(ErrorContext::new("StandardScaler not fitted"))
127 })?;
128 if data.ncols() != mean.len() {
129 return Err(CoreError::DimensionError(ErrorContext::new(format!(
130 "Expected {} features, got {}",
131 mean.len(),
132 data.ncols()
133 ))));
134 }
135 let mut result = data.clone();
136 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
137 for j in 0..data.ncols() {
138 for i in 0..data.nrows() {
139 let mut val = result[[i, j]];
140 if self.with_mean {
141 val = val - mean[j];
142 }
143 if self.with_std {
144 let s = if std_dev[j] < eps {
145 F::one()
146 } else {
147 std_dev[j]
148 };
149 val = val / s;
150 }
151 result[[i, j]] = val;
152 }
153 }
154 Ok(result)
155 }
156
157 pub fn fit_transform(&mut self, data: &Array2<F>) -> CoreResult<Array2<F>> {
159 self.fit(data)?;
160 self.transform(data)
161 }
162
163 pub fn inverse_transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
165 let mean = self.mean.as_ref().ok_or_else(|| {
166 CoreError::InvalidState(ErrorContext::new("StandardScaler not fitted"))
167 })?;
168 let std_dev = self.std_dev.as_ref().ok_or_else(|| {
169 CoreError::InvalidState(ErrorContext::new("StandardScaler not fitted"))
170 })?;
171 if data.ncols() != mean.len() {
172 return Err(CoreError::DimensionError(ErrorContext::new(format!(
173 "Expected {} features, got {}",
174 mean.len(),
175 data.ncols()
176 ))));
177 }
178 let mut result = data.clone();
179 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
180 for j in 0..data.ncols() {
181 for i in 0..data.nrows() {
182 let mut val = result[[i, j]];
183 if self.with_std {
184 let s = if std_dev[j] < eps {
185 F::one()
186 } else {
187 std_dev[j]
188 };
189 val = val * s;
190 }
191 if self.with_mean {
192 val = val + mean[j];
193 }
194 result[[i, j]] = val;
195 }
196 }
197 Ok(result)
198 }
199}
200
201impl<F: Float + FromPrimitive + Debug + Display + std::iter::Sum> Default for StandardScaler<F> {
202 fn default() -> Self {
203 Self::new()
204 }
205}
206
/// Scales each feature linearly into a fixed target range.
#[derive(Debug, Clone)]
pub struct MinMaxScaler<F: Float> {
    /// Per-feature minimum observed during `fit`; `None` before fitting.
    pub data_min: Option<Array1<F>>,
    /// Per-feature maximum observed during `fit`; `None` before fitting.
    pub data_max: Option<Array1<F>>,
    /// Lower bound of the output range.
    pub feature_min: F,
    /// Upper bound of the output range.
    pub feature_max: F,
}
237
238impl<F: Float + FromPrimitive + Debug + Display> MinMaxScaler<F> {
239 #[must_use]
241 pub fn new(feature_min: F, feature_max: F) -> Self {
242 Self {
243 data_min: None,
244 data_max: None,
245 feature_min,
246 feature_max,
247 }
248 }
249
250 #[must_use]
252 pub fn unit_range() -> Self {
253 Self::new(F::zero(), F::one())
254 }
255
256 pub fn fit(&mut self, data: &Array2<F>) -> CoreResult<()> {
258 if data.nrows() == 0 {
259 return Err(CoreError::ValueError(ErrorContext::new(
260 "Cannot fit MinMaxScaler on empty data",
261 )));
262 }
263 let n_cols = data.ncols();
264 let mut mins = Array1::<F>::zeros(n_cols);
265 let mut maxs = Array1::<F>::zeros(n_cols);
266 for j in 0..n_cols {
267 let col = data.column(j);
268 let mut col_min = F::infinity();
269 let mut col_max = F::neg_infinity();
270 for &v in col.iter() {
271 if v < col_min {
272 col_min = v;
273 }
274 if v > col_max {
275 col_max = v;
276 }
277 }
278 mins[j] = col_min;
279 maxs[j] = col_max;
280 }
281 self.data_min = Some(mins);
282 self.data_max = Some(maxs);
283 Ok(())
284 }
285
286 pub fn transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
288 let d_min = self
289 .data_min
290 .as_ref()
291 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("MinMaxScaler not fitted")))?;
292 let d_max = self
293 .data_max
294 .as_ref()
295 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("MinMaxScaler not fitted")))?;
296 if data.ncols() != d_min.len() {
297 return Err(CoreError::DimensionError(ErrorContext::new(format!(
298 "Expected {} features, got {}",
299 d_min.len(),
300 data.ncols()
301 ))));
302 }
303 let range = self.feature_max - self.feature_min;
304 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
305 let mut result = data.clone();
306 for j in 0..data.ncols() {
307 let data_range = d_max[j] - d_min[j];
308 let scale = if data_range.abs() < eps {
309 F::zero()
310 } else {
311 range / data_range
312 };
313 for i in 0..data.nrows() {
314 result[[i, j]] = (result[[i, j]] - d_min[j]) * scale + self.feature_min;
315 }
316 }
317 Ok(result)
318 }
319
320 pub fn fit_transform(&mut self, data: &Array2<F>) -> CoreResult<Array2<F>> {
322 self.fit(data)?;
323 self.transform(data)
324 }
325
326 pub fn inverse_transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
328 let d_min = self
329 .data_min
330 .as_ref()
331 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("MinMaxScaler not fitted")))?;
332 let d_max = self
333 .data_max
334 .as_ref()
335 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("MinMaxScaler not fitted")))?;
336 if data.ncols() != d_min.len() {
337 return Err(CoreError::DimensionError(ErrorContext::new(format!(
338 "Expected {} features, got {}",
339 d_min.len(),
340 data.ncols()
341 ))));
342 }
343 let range = self.feature_max - self.feature_min;
344 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
345 let mut result = data.clone();
346 for j in 0..data.ncols() {
347 let data_range = d_max[j] - d_min[j];
348 let scale = if range.abs() < eps {
349 F::zero()
350 } else {
351 data_range / range
352 };
353 for i in 0..data.nrows() {
354 result[[i, j]] = (result[[i, j]] - self.feature_min) * scale + d_min[j];
355 }
356 }
357 Ok(result)
358 }
359}
360
/// Scales features using statistics robust to outliers (median and IQR).
#[derive(Debug, Clone)]
pub struct RobustScaler<F: Float> {
    /// Per-feature median learned by `fit`; `None` before fitting.
    pub median: Option<Array1<F>>,
    /// Per-feature interquartile range (q3 - q1); `None` before fitting.
    pub iqr: Option<Array1<F>>,
    /// If `true`, subtract the per-feature median in `transform`.
    pub with_centering: bool,
    /// If `true`, divide by the per-feature IQR in `transform`.
    pub with_scaling: bool,
}
382
383impl<F: Float + FromPrimitive + Debug + Display> RobustScaler<F> {
384 #[must_use]
386 pub fn new() -> Self {
387 Self {
388 median: None,
389 iqr: None,
390 with_centering: true,
391 with_scaling: true,
392 }
393 }
394
395 #[must_use]
397 pub fn with_options(with_centering: bool, with_scaling: bool) -> Self {
398 Self {
399 median: None,
400 iqr: None,
401 with_centering,
402 with_scaling,
403 }
404 }
405
406 pub fn fit(&mut self, data: &Array2<F>) -> CoreResult<()> {
408 if data.nrows() == 0 {
409 return Err(CoreError::ValueError(ErrorContext::new(
410 "Cannot fit RobustScaler on empty data",
411 )));
412 }
413 let n_cols = data.ncols();
414 let mut median_arr = Array1::<F>::zeros(n_cols);
415 let mut iqr_arr = Array1::<F>::zeros(n_cols);
416 for j in 0..n_cols {
417 let mut col_vals: Vec<F> = data.column(j).iter().copied().collect();
418 col_vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
419 let n = col_vals.len();
420 median_arr[j] = compute_quantile(&col_vals, F::from_f64(0.5).unwrap_or_else(F::zero));
421 let q1 = compute_quantile(&col_vals, F::from_f64(0.25).unwrap_or_else(F::zero));
422 let q3 = compute_quantile(&col_vals, F::from_f64(0.75).unwrap_or_else(F::zero));
423 iqr_arr[j] = q3 - q1;
424 let _ = n; }
426 self.median = Some(median_arr);
427 self.iqr = Some(iqr_arr);
428 Ok(())
429 }
430
431 pub fn transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
433 let med = self
434 .median
435 .as_ref()
436 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("RobustScaler not fitted")))?;
437 let iqr = self
438 .iqr
439 .as_ref()
440 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("RobustScaler not fitted")))?;
441 if data.ncols() != med.len() {
442 return Err(CoreError::DimensionError(ErrorContext::new(format!(
443 "Expected {} features, got {}",
444 med.len(),
445 data.ncols()
446 ))));
447 }
448 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
449 let mut result = data.clone();
450 for j in 0..data.ncols() {
451 for i in 0..data.nrows() {
452 let mut val = result[[i, j]];
453 if self.with_centering {
454 val = val - med[j];
455 }
456 if self.with_scaling {
457 let s = if iqr[j].abs() < eps { F::one() } else { iqr[j] };
458 val = val / s;
459 }
460 result[[i, j]] = val;
461 }
462 }
463 Ok(result)
464 }
465
466 pub fn fit_transform(&mut self, data: &Array2<F>) -> CoreResult<Array2<F>> {
468 self.fit(data)?;
469 self.transform(data)
470 }
471}
472
473impl<F: Float + FromPrimitive + Debug + Display> Default for RobustScaler<F> {
474 fn default() -> Self {
475 Self::new()
476 }
477}
478
/// Scales each feature by its maximum absolute value, mapping data into
/// `[-1, 1]` without shifting/centering (preserves sparsity and sign).
#[derive(Debug, Clone)]
pub struct MaxAbsScaler<F: Float> {
    /// Per-feature maximum absolute value from `fit`; `None` before fitting.
    pub max_abs: Option<Array1<F>>,
}
491
492impl<F: Float + FromPrimitive + Debug + Display> MaxAbsScaler<F> {
493 #[must_use]
495 pub fn new() -> Self {
496 Self { max_abs: None }
497 }
498
499 pub fn fit(&mut self, data: &Array2<F>) -> CoreResult<()> {
501 if data.nrows() == 0 {
502 return Err(CoreError::ValueError(ErrorContext::new(
503 "Cannot fit MaxAbsScaler on empty data",
504 )));
505 }
506 let n_cols = data.ncols();
507 let mut max_abs_arr = Array1::<F>::zeros(n_cols);
508 for j in 0..n_cols {
509 let mut ma = F::zero();
510 for &v in data.column(j).iter() {
511 let av = v.abs();
512 if av > ma {
513 ma = av;
514 }
515 }
516 max_abs_arr[j] = ma;
517 }
518 self.max_abs = Some(max_abs_arr);
519 Ok(())
520 }
521
522 pub fn transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
524 let ma = self
525 .max_abs
526 .as_ref()
527 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("MaxAbsScaler not fitted")))?;
528 if data.ncols() != ma.len() {
529 return Err(CoreError::DimensionError(ErrorContext::new(format!(
530 "Expected {} features, got {}",
531 ma.len(),
532 data.ncols()
533 ))));
534 }
535 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
536 let mut result = data.clone();
537 for j in 0..data.ncols() {
538 let s = if ma[j].abs() < eps { F::one() } else { ma[j] };
539 for i in 0..data.nrows() {
540 result[[i, j]] = result[[i, j]] / s;
541 }
542 }
543 Ok(result)
544 }
545
546 pub fn fit_transform(&mut self, data: &Array2<F>) -> CoreResult<Array2<F>> {
548 self.fit(data)?;
549 self.transform(data)
550 }
551
552 pub fn inverse_transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
554 let ma = self
555 .max_abs
556 .as_ref()
557 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("MaxAbsScaler not fitted")))?;
558 if data.ncols() != ma.len() {
559 return Err(CoreError::DimensionError(ErrorContext::new(format!(
560 "Expected {} features, got {}",
561 ma.len(),
562 data.ncols()
563 ))));
564 }
565 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
566 let mut result = data.clone();
567 for j in 0..data.ncols() {
568 let s = if ma[j].abs() < eps { F::one() } else { ma[j] };
569 for i in 0..data.nrows() {
570 result[[i, j]] = result[[i, j]] * s;
571 }
572 }
573 Ok(result)
574 }
575}
576
577impl<F: Float + FromPrimitive + Debug + Display> Default for MaxAbsScaler<F> {
578 fn default() -> Self {
579 Self::new()
580 }
581}
582
/// Maps hashable labels to dense integer ids, assigned in first-seen order.
#[derive(Debug, Clone)]
pub struct LabelEncoder<L: Eq + Hash + Clone> {
    /// Forward mapping: label -> integer id.
    pub label_to_int: HashMap<L, usize>,
    /// Reverse mapping: index is the id, element is the label.
    pub int_to_label: Vec<L>,
}
607
608impl<L: Eq + Hash + Clone + Debug> LabelEncoder<L> {
609 #[must_use]
611 pub fn new() -> Self {
612 Self {
613 label_to_int: HashMap::new(),
614 int_to_label: Vec::new(),
615 }
616 }
617
618 pub fn fit(&mut self, labels: &[L]) {
621 self.label_to_int.clear();
622 self.int_to_label.clear();
623 for label in labels {
624 if !self.label_to_int.contains_key(label) {
625 let idx = self.int_to_label.len();
626 self.label_to_int.insert(label.clone(), idx);
627 self.int_to_label.push(label.clone());
628 }
629 }
630 }
631
632 pub fn transform(&self, labels: &[L]) -> CoreResult<Vec<usize>> {
634 if self.int_to_label.is_empty() {
635 return Err(CoreError::InvalidState(ErrorContext::new(
636 "LabelEncoder not fitted",
637 )));
638 }
639 let mut result = Vec::with_capacity(labels.len());
640 for label in labels {
641 let idx = self.label_to_int.get(label).ok_or_else(|| {
642 CoreError::ValueError(ErrorContext::new(format!(
643 "Unknown label encountered: {:?}",
644 label
645 )))
646 })?;
647 result.push(*idx);
648 }
649 Ok(result)
650 }
651
652 pub fn inverse_transform(&self, indices: &[usize]) -> CoreResult<Vec<L>> {
654 let mut result = Vec::with_capacity(indices.len());
655 for &idx in indices {
656 if idx >= self.int_to_label.len() {
657 return Err(CoreError::IndexError(ErrorContext::new(format!(
658 "Label index {} out of range (max {})",
659 idx,
660 self.int_to_label.len().saturating_sub(1)
661 ))));
662 }
663 result.push(self.int_to_label[idx].clone());
664 }
665 Ok(result)
666 }
667
668 pub fn fit_transform(&mut self, labels: &[L]) -> Vec<usize> {
670 self.fit(labels);
671 labels.iter().map(|l| self.label_to_int[l]).collect()
673 }
674
675 #[must_use]
677 pub fn n_classes(&self) -> usize {
678 self.int_to_label.len()
679 }
680}
681
682impl<L: Eq + Hash + Clone + Debug> Default for LabelEncoder<L> {
683 fn default() -> Self {
684 Self::new()
685 }
686}
687
/// One-hot encodes each categorical column with its own label vocabulary.
#[derive(Debug, Clone)]
pub struct OneHotEncoder<L: Eq + Hash + Clone> {
    /// One `LabelEncoder` per input column, in column order.
    pub encoders: Vec<LabelEncoder<L>>,
    /// Number of input columns observed during `fit` (taken from the first row).
    pub n_features: usize,
}
714
715impl<L: Eq + Hash + Clone + Debug> OneHotEncoder<L> {
716 #[must_use]
718 pub fn new() -> Self {
719 Self {
720 encoders: Vec::new(),
721 n_features: 0,
722 }
723 }
724
725 pub fn fit(&mut self, data: &[Vec<L>]) {
727 if data.is_empty() {
728 self.n_features = 0;
729 self.encoders.clear();
730 return;
731 }
732 self.n_features = data[0].len();
733 self.encoders.clear();
734 for j in 0..self.n_features {
735 let mut enc = LabelEncoder::new();
736 let col: Vec<L> = data.iter().map(|row| row[j].clone()).collect();
737 enc.fit(&col);
738 self.encoders.push(enc);
739 }
740 }
741
742 pub fn transform(&self, data: &[Vec<L>]) -> CoreResult<Array2<f64>> {
744 if self.encoders.is_empty() {
745 return Err(CoreError::InvalidState(ErrorContext::new(
746 "OneHotEncoder not fitted",
747 )));
748 }
749 let total_cols: usize = self.encoders.iter().map(|e| e.n_classes()).sum();
750 let n_rows = data.len();
751 let mut result = Array2::<f64>::zeros((n_rows, total_cols));
752 let mut col_offset = 0;
753 for (j, enc) in self.encoders.iter().enumerate() {
754 let col_labels: Vec<L> = data.iter().map(|row| row[j].clone()).collect();
755 let indices = enc.transform(&col_labels)?;
756 for (i, idx) in indices.into_iter().enumerate() {
757 result[[i, col_offset + idx]] = 1.0;
758 }
759 col_offset += enc.n_classes();
760 }
761 Ok(result)
762 }
763
764 pub fn fit_transform(&mut self, data: &[Vec<L>]) -> CoreResult<Array2<f64>> {
766 self.fit(data);
767 self.transform(data)
768 }
769
770 #[must_use]
772 pub fn n_output_features(&self) -> usize {
773 self.encoders.iter().map(|e| e.n_classes()).sum()
774 }
775}
776
777impl<L: Eq + Hash + Clone + Debug> Default for OneHotEncoder<L> {
778 fn default() -> Self {
779 Self::new()
780 }
781}
782
/// Encodes each categorical column to integer ids (one vocabulary per column).
#[derive(Debug, Clone)]
pub struct OrdinalEncoder<L: Eq + Hash + Clone> {
    /// One `LabelEncoder` per input column, in column order.
    pub encoders: Vec<LabelEncoder<L>>,
    /// Number of input columns observed during `fit` (taken from the first row).
    pub n_features: usize,
}
798
799impl<L: Eq + Hash + Clone + Debug> OrdinalEncoder<L> {
800 #[must_use]
802 pub fn new() -> Self {
803 Self {
804 encoders: Vec::new(),
805 n_features: 0,
806 }
807 }
808
809 pub fn fit(&mut self, data: &[Vec<L>]) {
811 if data.is_empty() {
812 self.n_features = 0;
813 self.encoders.clear();
814 return;
815 }
816 self.n_features = data[0].len();
817 self.encoders.clear();
818 for j in 0..self.n_features {
819 let mut enc = LabelEncoder::new();
820 let col: Vec<L> = data.iter().map(|row| row[j].clone()).collect();
821 enc.fit(&col);
822 self.encoders.push(enc);
823 }
824 }
825
826 pub fn transform(&self, data: &[Vec<L>]) -> CoreResult<Vec<Vec<usize>>> {
828 if self.encoders.is_empty() {
829 return Err(CoreError::InvalidState(ErrorContext::new(
830 "OrdinalEncoder not fitted",
831 )));
832 }
833 let n_rows = data.len();
834 let mut result = vec![vec![0usize; self.n_features]; n_rows];
835 for (j, enc) in self.encoders.iter().enumerate() {
836 let col_labels: Vec<L> = data.iter().map(|row| row[j].clone()).collect();
837 let indices = enc.transform(&col_labels)?;
838 for (i, idx) in indices.into_iter().enumerate() {
839 result[i][j] = idx;
840 }
841 }
842 Ok(result)
843 }
844
845 pub fn inverse_transform(&self, data: &[Vec<usize>]) -> CoreResult<Vec<Vec<L>>> {
847 if self.encoders.is_empty() {
848 return Err(CoreError::InvalidState(ErrorContext::new(
849 "OrdinalEncoder not fitted",
850 )));
851 }
852 let n_rows = data.len();
853 let mut result: Vec<Vec<L>> = Vec::with_capacity(n_rows);
854 for row in data {
855 let mut out_row = Vec::with_capacity(self.n_features);
856 for (j, enc) in self.encoders.iter().enumerate() {
857 let labels = enc.inverse_transform(&[row[j]])?;
858 out_row.push(labels.into_iter().next().ok_or_else(|| {
859 CoreError::ValueError(ErrorContext::new("Empty inverse_transform result"))
860 })?);
861 }
862 result.push(out_row);
863 }
864 Ok(result)
865 }
866
867 pub fn fit_transform(&mut self, data: &[Vec<L>]) -> CoreResult<Vec<Vec<usize>>> {
869 self.fit(data);
870 self.transform(data)
871 }
872}
873
874impl<L: Eq + Hash + Clone + Debug> Default for OrdinalEncoder<L> {
875 fn default() -> Self {
876 Self::new()
877 }
878}
879
/// Strategy used by [`Imputer`] to choose replacement values for NaNs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ImputeStrategy {
    /// Replace NaNs with the column mean of the non-NaN entries.
    Mean,
    /// Replace NaNs with the column median of the non-NaN entries.
    Median,
    /// Replace NaNs with the most frequent (quantized) non-NaN column value.
    Mode,
    /// Replace NaNs with a user-supplied constant.
    Constant,
}
896
/// Fills NaN entries column-by-column using a configurable strategy.
#[derive(Debug, Clone)]
pub struct Imputer<F: Float> {
    /// How replacement values are derived.
    pub strategy: ImputeStrategy,
    /// Per-column fill values computed by `fit`; `None` before fitting.
    pub fill_values: Option<Array1<F>>,
    /// Constant used by `ImputeStrategy::Constant` and as the fallback
    /// for columns that contain only NaNs.
    pub fill_constant: F,
}
923
924impl<F: Float + FromPrimitive + Debug + Display + std::iter::Sum> Imputer<F> {
925 #[must_use]
929 pub fn new(strategy: ImputeStrategy, fill_constant: Option<F>) -> Self {
930 Self {
931 strategy,
932 fill_values: None,
933 fill_constant: fill_constant.unwrap_or_else(F::zero),
934 }
935 }
936
937 pub fn fit(&mut self, data: &Array2<F>) -> CoreResult<()> {
939 if data.nrows() == 0 {
940 return Err(CoreError::ValueError(ErrorContext::new(
941 "Cannot fit Imputer on empty data",
942 )));
943 }
944 let n_cols = data.ncols();
945 let mut fill_vals = Array1::<F>::zeros(n_cols);
946 for j in 0..n_cols {
947 let col = data.column(j);
948 let valid: Vec<F> = col.iter().copied().filter(|v| !v.is_nan()).collect();
949 if valid.is_empty() {
950 fill_vals[j] = self.fill_constant;
951 continue;
952 }
953 match self.strategy {
954 ImputeStrategy::Mean => {
955 let n = F::from_usize(valid.len()).unwrap_or_else(F::one);
956 let s: F = valid.iter().copied().sum();
957 fill_vals[j] = s / n;
958 }
959 ImputeStrategy::Median => {
960 let mut sorted = valid.clone();
961 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
962 fill_vals[j] =
963 compute_quantile(&sorted, F::from_f64(0.5).unwrap_or_else(F::zero));
964 }
965 ImputeStrategy::Mode => {
966 let factor = F::from_f64(1e6).unwrap_or_else(F::one);
968 let mut counts: HashMap<i64, (usize, F)> = HashMap::new();
969 for &v in &valid {
970 let key = NumCast::from(v * factor)
971 .map(|x: f64| x.round() as i64)
972 .unwrap_or(0);
973 let entry = counts.entry(key).or_insert((0, v));
974 entry.0 += 1;
975 }
976 let mode_val = counts
977 .values()
978 .max_by_key(|(count, _)| *count)
979 .map(|(_, v)| *v)
980 .unwrap_or_else(F::zero);
981 fill_vals[j] = mode_val;
982 }
983 ImputeStrategy::Constant => {
984 fill_vals[j] = self.fill_constant;
985 }
986 }
987 }
988 self.fill_values = Some(fill_vals);
989 Ok(())
990 }
991
992 pub fn transform(&self, data: &Array2<F>) -> CoreResult<Array2<F>> {
994 let fill = self
995 .fill_values
996 .as_ref()
997 .ok_or_else(|| CoreError::InvalidState(ErrorContext::new("Imputer not fitted")))?;
998 if data.ncols() != fill.len() {
999 return Err(CoreError::DimensionError(ErrorContext::new(format!(
1000 "Expected {} features, got {}",
1001 fill.len(),
1002 data.ncols()
1003 ))));
1004 }
1005 let mut result = data.clone();
1006 for j in 0..data.ncols() {
1007 for i in 0..data.nrows() {
1008 if result[[i, j]].is_nan() {
1009 result[[i, j]] = fill[j];
1010 }
1011 }
1012 }
1013 Ok(result)
1014 }
1015
1016 pub fn fit_transform(&mut self, data: &Array2<F>) -> CoreResult<Array2<F>> {
1018 self.fit(data)?;
1019 self.transform(data)
1020 }
1021}
1022
/// Column statistic used by [`OutlierDetector`] to flag outliers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutlierMethod {
    /// Flag values whose |z-score| exceeds the threshold.
    ZScore,
    /// Flag values outside `[q1 - t*IQR, q3 + t*IQR]` fences.
    Iqr,
}
1035
/// Flags rows (or individual cells) whose values are statistical outliers
/// relative to per-column statistics learned in `fit`.
#[derive(Debug, Clone)]
pub struct OutlierDetector<F: Float> {
    /// Detection method (z-score or IQR fences).
    pub method: OutlierMethod,
    /// Cutoff: maximum |z| for `ZScore`, IQR fence multiplier for `Iqr`.
    pub threshold: F,
    /// Per-column (mean, std) fitted for `ZScore`; `None` before fitting.
    zscore_params: Option<Vec<(F, F)>>,
    /// Per-column (q1, q3, iqr) fitted for `Iqr`; `None` before fitting.
    iqr_params: Option<Vec<(F, F, F)>>,
}
1062
1063impl<F: Float + FromPrimitive + Debug + Display + std::iter::Sum> OutlierDetector<F> {
1064 #[must_use]
1069 pub fn new(method: OutlierMethod, threshold: F) -> Self {
1070 Self {
1071 method,
1072 threshold,
1073 zscore_params: None,
1074 iqr_params: None,
1075 }
1076 }
1077
1078 pub fn fit(&mut self, data: &Array2<F>) -> CoreResult<()> {
1080 if data.nrows() == 0 {
1081 return Err(CoreError::ValueError(ErrorContext::new(
1082 "Cannot fit OutlierDetector on empty data",
1083 )));
1084 }
1085 let n_cols = data.ncols();
1086 match self.method {
1087 OutlierMethod::ZScore => {
1088 let mut params = Vec::with_capacity(n_cols);
1089 for j in 0..n_cols {
1090 let col = data.column(j);
1091 let n = F::from_usize(col.len()).unwrap_or_else(F::one);
1092 let sum: F = col.iter().copied().sum();
1093 let mean = sum / n;
1094 let var_sum: F = col.iter().map(|&x| (x - mean) * (x - mean)).sum();
1095 let std_dev = (var_sum / n).sqrt();
1096 params.push((mean, std_dev));
1097 }
1098 self.zscore_params = Some(params);
1099 }
1100 OutlierMethod::Iqr => {
1101 let mut params = Vec::with_capacity(n_cols);
1102 for j in 0..n_cols {
1103 let mut sorted: Vec<F> = data.column(j).iter().copied().collect();
1104 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1105 let q1 = compute_quantile(&sorted, F::from_f64(0.25).unwrap_or_else(F::zero));
1106 let q3 = compute_quantile(&sorted, F::from_f64(0.75).unwrap_or_else(F::zero));
1107 let iqr = q3 - q1;
1108 params.push((q1, q3, iqr));
1109 }
1110 self.iqr_params = Some(params);
1111 }
1112 }
1113 Ok(())
1114 }
1115
1116 pub fn detect(&self, data: &Array2<F>) -> CoreResult<Vec<bool>> {
1120 let n_rows = data.nrows();
1121 let mut mask = vec![false; n_rows];
1122 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
1123 match self.method {
1124 OutlierMethod::ZScore => {
1125 let params = self.zscore_params.as_ref().ok_or_else(|| {
1126 CoreError::InvalidState(ErrorContext::new("OutlierDetector not fitted"))
1127 })?;
1128 for j in 0..data.ncols() {
1129 let (mean, std_dev) = params[j];
1130 let s = if std_dev.abs() < eps {
1131 F::one()
1132 } else {
1133 std_dev
1134 };
1135 for i in 0..n_rows {
1136 let z = (data[[i, j]] - mean) / s;
1137 if z.abs() > self.threshold {
1138 mask[i] = true;
1139 }
1140 }
1141 }
1142 }
1143 OutlierMethod::Iqr => {
1144 let params = self.iqr_params.as_ref().ok_or_else(|| {
1145 CoreError::InvalidState(ErrorContext::new("OutlierDetector not fitted"))
1146 })?;
1147 for j in 0..data.ncols() {
1148 let (q1, q3, iqr) = params[j];
1149 let lower = q1 - self.threshold * iqr;
1150 let upper = q3 + self.threshold * iqr;
1151 for i in 0..n_rows {
1152 let v = data[[i, j]];
1153 if v < lower || v > upper {
1154 mask[i] = true;
1155 }
1156 }
1157 }
1158 }
1159 }
1160 Ok(mask)
1161 }
1162
1163 pub fn detect_per_feature(&self, data: &Array2<F>) -> CoreResult<Array2<bool>> {
1165 let n_rows = data.nrows();
1166 let n_cols = data.ncols();
1167 let mut mask = Array2::<bool>::default((n_rows, n_cols));
1168 let eps = F::from_f64(1e-10).unwrap_or_else(F::epsilon);
1169 match self.method {
1170 OutlierMethod::ZScore => {
1171 let params = self.zscore_params.as_ref().ok_or_else(|| {
1172 CoreError::InvalidState(ErrorContext::new("OutlierDetector not fitted"))
1173 })?;
1174 for j in 0..n_cols {
1175 let (mean, std_dev) = params[j];
1176 let s = if std_dev.abs() < eps {
1177 F::one()
1178 } else {
1179 std_dev
1180 };
1181 for i in 0..n_rows {
1182 let z = (data[[i, j]] - mean) / s;
1183 mask[[i, j]] = z.abs() > self.threshold;
1184 }
1185 }
1186 }
1187 OutlierMethod::Iqr => {
1188 let params = self.iqr_params.as_ref().ok_or_else(|| {
1189 CoreError::InvalidState(ErrorContext::new("OutlierDetector not fitted"))
1190 })?;
1191 for j in 0..n_cols {
1192 let (q1, q3, iqr) = params[j];
1193 let lower = q1 - self.threshold * iqr;
1194 let upper = q3 + self.threshold * iqr;
1195 for i in 0..n_rows {
1196 let v = data[[i, j]];
1197 mask[[i, j]] = v < lower || v > upper;
1198 }
1199 }
1200 }
1201 }
1202 Ok(mask)
1203 }
1204}
1205
1206fn compute_quantile<F: Float + FromPrimitive>(sorted: &[F], q: F) -> F {
1212 if sorted.is_empty() {
1213 return F::zero();
1214 }
1215 if sorted.len() == 1 {
1216 return sorted[0];
1217 }
1218 let n = sorted.len();
1219 let idx_f = q * F::from_usize(n - 1).unwrap_or_else(F::zero);
1220 let lower = NumCast::from(idx_f.floor()).unwrap_or(0usize);
1221 let upper = NumCast::from(idx_f.ceil()).unwrap_or(n - 1);
1222 let lower = lower.min(n - 1);
1223 let upper = upper.min(n - 1);
1224 if lower == upper {
1225 return sorted[lower];
1226 }
1227 let frac = idx_f - F::from_usize(lower).unwrap_or_else(F::zero);
1228 sorted[lower] * (F::one() - frac) + sorted[upper] * frac
1229}
1230
1231#[cfg(test)]
1236mod tests {
1237 use super::*;
1238 use ::ndarray::array;
1239
1240 const EPS: f64 = 1e-6;
1241
    #[test]
    fn test_standard_scaler_basic() {
        // Standardized columns must have (near-)zero mean.
        let data = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let mut scaler = StandardScaler::<f64>::new();
        scaler.fit(&data).expect("fit");
        let transformed = scaler.transform(&data).expect("transform");
        for j in 0..2 {
            let col_mean: f64 = transformed.column(j).iter().sum::<f64>() / 3.0;
            assert!(col_mean.abs() < EPS, "col {} mean = {}", j, col_mean);
        }
    }
1254
    #[test]
    fn test_standard_scaler_inverse() {
        // inverse_transform(transform(x)) must round-trip to the input.
        let data = array![[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]];
        let mut scaler = StandardScaler::<f64>::new();
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        let inv = scaler.inverse_transform(&t).expect("inverse");
        for i in 0..3 {
            for j in 0..2 {
                assert!(
                    (inv[[i, j]] - data[[i, j]]).abs() < EPS,
                    "mismatch at [{}, {}]",
                    i,
                    j
                );
            }
        }
    }
1273
    #[test]
    fn test_standard_scaler_empty_error() {
        // Fitting on a zero-row matrix must fail, not silently succeed.
        let data = Array2::<f64>::zeros((0, 3));
        let mut scaler = StandardScaler::<f64>::new();
        assert!(scaler.fit(&data).is_err());
    }
1280
    #[test]
    fn test_minmax_scaler_basic() {
        // Each column's min maps to 0 and its max maps to 1.
        let data = array![[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]];
        let mut scaler = MinMaxScaler::<f64>::new(0.0, 1.0);
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        assert!((t[[0, 0]] - 0.0).abs() < EPS);
        assert!((t[[2, 0]] - 1.0).abs() < EPS);
        assert!((t[[0, 1]] - 0.0).abs() < EPS);
        assert!((t[[2, 1]] - 1.0).abs() < EPS);
    }
1292
    #[test]
    fn test_minmax_scaler_custom_range() {
        // A [-1, 1] target range maps min -> -1, midpoint -> 0, max -> 1.
        let data = array![[0.0], [5.0], [10.0]];
        let mut scaler = MinMaxScaler::<f64>::new(-1.0, 1.0);
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        assert!((t[[0, 0]] - (-1.0)).abs() < EPS);
        assert!((t[[1, 0]] - 0.0).abs() < EPS);
        assert!((t[[2, 0]] - 1.0).abs() < EPS);
    }
1303
    #[test]
    fn test_minmax_scaler_inverse() {
        // inverse_transform(transform(x)) must round-trip to the input.
        let data = array![[2.0, 4.0], [6.0, 8.0]];
        let mut scaler = MinMaxScaler::<f64>::new(0.0, 1.0);
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        let inv = scaler.inverse_transform(&t).expect("inverse");
        for i in 0..2 {
            for j in 0..2 {
                assert!((inv[[i, j]] - data[[i, j]]).abs() < EPS);
            }
        }
    }
1317
    #[test]
    fn test_robust_scaler_basic() {
        // The median sample (3.0) maps to zero; the 100.0 outlier does
        // not distort the median/IQR statistics.
        let data = array![[1.0], [2.0], [3.0], [4.0], [100.0]];
        let mut scaler = RobustScaler::<f64>::new();
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        assert!((t[[2, 0]]).abs() < EPS);
    }
1327
    #[test]
    fn test_max_abs_scaler_basic() {
        // Column peaks are |−3| = 3 and |−5| = 5, so those entries land on -1.
        let data = array![[-3.0, 2.0], [1.0, -5.0]];
        let mut scaler = MaxAbsScaler::<f64>::new();
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        assert!((t[[0, 0]] - (-1.0)).abs() < EPS);
        assert!((t[[1, 1]] - (-1.0)).abs() < EPS);
    }
1338
    #[test]
    fn test_max_abs_scaler_inverse() {
        // inverse_transform(transform(x)) must round-trip to the input.
        let data = array![[4.0, -8.0], [-2.0, 6.0]];
        let mut scaler = MaxAbsScaler::<f64>::new();
        scaler.fit(&data).expect("fit");
        let t = scaler.transform(&data).expect("transform");
        let inv = scaler.inverse_transform(&t).expect("inverse");
        for i in 0..2 {
            for j in 0..2 {
                assert!((inv[[i, j]] - data[[i, j]]).abs() < EPS);
            }
        }
    }
1352
    #[test]
    fn test_label_encoder() {
        // Identical labels share an id, and decoding round-trips exactly.
        let labels = vec!["cat", "dog", "cat", "bird", "dog"];
        let mut enc = LabelEncoder::new();
        enc.fit(&labels);
        assert_eq!(enc.n_classes(), 3);
        let encoded = enc.transform(&labels).expect("transform");
        assert_eq!(encoded[0], encoded[2]);
        assert_eq!(encoded[1], encoded[4]);
        let decoded = enc.inverse_transform(&encoded).expect("inverse");
        assert_eq!(decoded, labels);
    }
1365
    #[test]
    fn test_label_encoder_unknown() {
        // A label never seen during fit must produce an error, not a panic.
        let labels = vec!["a", "b"];
        let mut enc = LabelEncoder::new();
        enc.fit(&labels);
        let result = enc.transform(&["c"]);
        assert!(result.is_err());
    }
1374
    #[test]
    fn test_one_hot_encoder() {
        // Two categorical columns with 2 classes each -> 4 one-hot columns,
        // and exactly one hot bit per original column (row sum == 2).
        let data = vec![
            vec!["red", "small"],
            vec!["blue", "large"],
            vec!["red", "large"],
        ];
        let mut enc = OneHotEncoder::new();
        enc.fit(&data);
        let encoded = enc.transform(&data).expect("transform");
        assert_eq!(encoded.nrows(), 3);
        assert_eq!(encoded.ncols(), 4);
        for i in 0..3 {
            let row_sum: f64 = encoded.row(i).iter().sum();
            assert!((row_sum - 2.0).abs() < EPS);
        }
    }
1393
    #[test]
    fn test_ordinal_encoder() {
        // Repeated labels get the same per-column id; decode round-trips.
        let data = vec![vec!["a", "x"], vec!["b", "y"], vec!["a", "y"]];
        let mut enc = OrdinalEncoder::new();
        let encoded = enc.fit_transform(&data).expect("transform");
        assert_eq!(encoded[0][0], encoded[2][0]);
        let decoded = enc.inverse_transform(&encoded).expect("inverse");
        assert_eq!(decoded, data);
    }
1403
1404 #[test]
1405 fn test_imputer_mean() {
1406 let data = array![[1.0, f64::NAN], [3.0, 4.0], [5.0, 6.0]];
1407 let mut imp = Imputer::<f64>::new(ImputeStrategy::Mean, None);
1408 imp.fit(&data).expect("fit");
1409 let filled = imp.transform(&data).expect("transform");
1410 assert!(!filled[[0, 1]].is_nan());
1411 assert!((filled[[0, 1]] - 5.0).abs() < EPS);
1413 }
1414
1415 #[test]
1416 fn test_imputer_median() {
1417 let data = array![[f64::NAN, 1.0], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0]];
1418 let mut imp = Imputer::<f64>::new(ImputeStrategy::Median, None);
1419 imp.fit(&data).expect("fit");
1420 let filled = imp.transform(&data).expect("transform");
1421 assert!(!filled[[0, 0]].is_nan());
1422 assert!((filled[[0, 0]] - 4.0).abs() < EPS);
1424 }
1425
1426 #[test]
1427 fn test_imputer_constant() {
1428 let data = array![[1.0, f64::NAN], [f64::NAN, 4.0]];
1429 let mut imp = Imputer::<f64>::new(ImputeStrategy::Constant, Some(-999.0));
1430 imp.fit(&data).expect("fit");
1431 let filled = imp.transform(&data).expect("transform");
1432 assert!((filled[[0, 1]] - (-999.0)).abs() < EPS);
1433 assert!((filled[[1, 0]] - (-999.0)).abs() < EPS);
1434 }
1435
1436 #[test]
1437 fn test_outlier_zscore() {
1438 let data = array![
1440 [1.0],
1441 [2.0],
1442 [3.0],
1443 [2.0],
1444 [1.5],
1445 [2.5],
1446 [3.0],
1447 [2.0],
1448 [1.0],
1449 [2.0],
1450 [3.0],
1451 [2.5],
1452 [1.5],
1453 [2.0],
1454 [2.5],
1455 [100.0]
1456 ];
1457 let mut det = OutlierDetector::<f64>::new(OutlierMethod::ZScore, 2.0);
1458 det.fit(&data).expect("fit");
1459 let mask = det.detect(&data).expect("detect");
1460 assert!(mask[15]);
1462 assert!(!mask[0]);
1464 assert!(!mask[1]);
1465 }
1466
1467 #[test]
1468 fn test_outlier_iqr() {
1469 let data = array![[1.0], [2.0], [3.0], [4.0], [5.0], [100.0]];
1470 let mut det = OutlierDetector::<f64>::new(OutlierMethod::Iqr, 1.5);
1471 det.fit(&data).expect("fit");
1472 let mask = det.detect(&data).expect("detect");
1473 assert!(mask[5]); assert!(!mask[0]);
1475 }
1476
1477 #[test]
1478 fn test_outlier_per_feature() {
1479 let data = array![[1.0, 10.0], [2.0, 20.0], [3.0, 100.0]];
1480 let mut det = OutlierDetector::<f64>::new(OutlierMethod::ZScore, 1.0);
1481 det.fit(&data).expect("fit");
1482 let mask = det.detect_per_feature(&data).expect("detect");
1483 assert_eq!(mask.nrows(), 3);
1484 assert_eq!(mask.ncols(), 2);
1485 }
1486
1487 #[test]
1488 fn test_standard_scaler_f32() {
1489 let data = array![[1.0f32, 2.0], [3.0, 4.0], [5.0, 6.0]];
1490 let mut scaler = StandardScaler::<f32>::new();
1491 scaler.fit(&data).expect("fit");
1492 let t = scaler.transform(&data).expect("transform");
1493 let col_mean: f32 = t.column(0).iter().sum::<f32>() / 3.0;
1494 assert!(col_mean.abs() < 1e-4);
1495 }
1496
1497 #[test]
1498 fn test_compute_quantile() {
1499 let sorted = vec![1.0f64, 2.0, 3.0, 4.0, 5.0];
1500 assert!((compute_quantile(&sorted, 0.0) - 1.0).abs() < EPS);
1501 assert!((compute_quantile(&sorted, 0.5) - 3.0).abs() < EPS);
1502 assert!((compute_quantile(&sorted, 1.0) - 5.0).abs() < EPS);
1503 assert!((compute_quantile(&sorted, 0.25) - 2.0).abs() < EPS);
1504 }
1505
1506 #[test]
1507 fn test_constant_feature_standard_scaler() {
1508 let data = array![[5.0], [5.0], [5.0]];
1510 let mut scaler = StandardScaler::<f64>::new();
1511 scaler.fit(&data).expect("fit");
1512 let t = scaler.transform(&data).expect("transform");
1513 assert!(!t[[0, 0]].is_nan());
1514 }
1515
1516 #[test]
1517 fn test_constant_feature_minmax() {
1518 let data = array![[5.0], [5.0], [5.0]];
1519 let mut scaler = MinMaxScaler::<f64>::new(0.0, 1.0);
1520 scaler.fit(&data).expect("fit");
1521 let t = scaler.transform(&data).expect("transform");
1522 assert!(!t[[0, 0]].is_nan());
1523 }
1524
1525 #[test]
1526 fn test_fit_transform_shortcut() {
1527 let data = array![[1.0, 2.0], [3.0, 4.0]];
1528 let mut scaler = StandardScaler::<f64>::new();
1529 let t = scaler.fit_transform(&data).expect("fit_transform");
1530 assert_eq!(t.shape(), &[2, 2]);
1531 }
1532
1533 #[test]
1534 fn test_dimension_mismatch_error() {
1535 let train = array![[1.0, 2.0], [3.0, 4.0]];
1536 let test_data = array![[1.0, 2.0, 3.0]];
1537 let mut scaler = StandardScaler::<f64>::new();
1538 scaler.fit(&train).expect("fit");
1539 assert!(scaler.transform(&test_data).is_err());
1540 }
1541
1542 #[test]
1543 fn test_not_fitted_error() {
1544 let data = array![[1.0]];
1545 let scaler = StandardScaler::<f64>::new();
1546 assert!(scaler.transform(&data).is_err());
1547 }
1548
1549 #[test]
1550 fn test_label_encoder_fit_transform() {
1551 let labels = vec![10, 20, 30, 20, 10];
1552 let mut enc = LabelEncoder::new();
1553 let encoded = enc.fit_transform(&labels);
1554 assert_eq!(encoded[0], encoded[4]);
1555 assert_eq!(encoded[1], encoded[3]);
1556 assert_ne!(encoded[0], encoded[1]);
1557 }
1558
1559 #[test]
1560 fn test_imputer_mode() {
1561 let data = array![[1.0, f64::NAN], [2.0, 3.0], [2.0, 3.0], [3.0, 5.0]];
1562 let mut imp = Imputer::<f64>::new(ImputeStrategy::Mode, None);
1563 imp.fit(&data).expect("fit");
1564 let filled = imp.transform(&data).expect("transform");
1565 assert!(
1567 (filled[[0, 0]] - 1.0).abs() < EPS
1568 || (filled[[0, 0]] - 2.0).abs() < EPS
1569 || (filled[[0, 0]] - 3.0).abs() < EPS
1570 );
1571 assert!((filled[[0, 1]] - 3.0).abs() < EPS);
1573 }
1574}