1use scirs2_core::ndarray::{Array2, ArrayBase, Data, Ix2};
7use scirs2_core::numeric::{Float, NumCast};
8use std::collections::HashMap;
9
10use crate::error::{Result, TransformError};
11
#[derive(Debug, Clone)]
/// Sparse matrix stored as three parallel vectors of row indices, column
/// indices and values (one element of each vector per stored entry).
pub struct SparseMatrix {
    /// Matrix dimensions as `(rows, columns)`.
    pub shape: (usize, usize),
    /// Row index of each stored entry.
    pub row_indices: Vec<usize>,
    /// Column index of each stored entry.
    pub col_indices: Vec<usize>,
    /// Value of each stored entry.
    pub values: Vec<f64>,
}
24
25impl SparseMatrix {
26 pub fn new(shape: (usize, usize)) -> Self {
28 SparseMatrix {
29 shape,
30 row_indices: Vec::new(),
31 col_indices: Vec::new(),
32 values: Vec::new(),
33 }
34 }
35
36 pub fn push(&mut self, row: usize, col: usize, value: f64) {
38 if row < self.shape.0 && col < self.shape.1 && value != 0.0 {
39 self.row_indices.push(row);
40 self.col_indices.push(col);
41 self.values.push(value);
42 }
43 }
44
45 pub fn to_dense(&self) -> Array2<f64> {
47 let mut dense = Array2::zeros(self.shape);
48 for ((&row, &col), &val) in self
49 .row_indices
50 .iter()
51 .zip(self.col_indices.iter())
52 .zip(self.values.iter())
53 {
54 dense[[row, col]] = val;
55 }
56 dense
57 }
58
59 pub fn nnz(&self) -> usize {
61 self.values.len()
62 }
63}
64
#[derive(Debug, Clone)]
/// Encoder output that is either a dense array or a [`SparseMatrix`].
pub enum EncodedOutput {
    /// Dense two-dimensional array of `f64` values.
    Dense(Array2<f64>),
    /// Sparse representation of the encoded matrix.
    Sparse(SparseMatrix),
}
73
74impl EncodedOutput {
75 pub fn to_dense(&self) -> Array2<f64> {
77 match self {
78 EncodedOutput::Dense(arr) => arr.clone(),
79 EncodedOutput::Sparse(sparse) => sparse.to_dense(),
80 }
81 }
82
83 pub fn shape(&self) -> (usize, usize) {
85 match self {
86 EncodedOutput::Dense(arr) => (arr.nrows(), arr.ncols()),
87 EncodedOutput::Sparse(sparse) => sparse.shape,
88 }
89 }
90}
91
// `Debug` and `Clone` are derived for parity with the other encoder structs
// in this file (`TargetEncoder`, `BinaryEncoder`, ...), which all derive both.
#[derive(Debug, Clone)]
/// One-hot encoder: configuration plus the categories learned by `fit`.
pub struct OneHotEncoder {
    /// Sorted, de-duplicated category codes per input feature; `None` until fitted.
    categories_: Option<Vec<Vec<u64>>>,
    /// Drop strategy: `Some("first")`, `Some("if_binary")`, or `None` to keep every category.
    drop: Option<String>,
    /// Policy for categories not seen during fitting: `"error"` or `"ignore"`.
    handleunknown: String,
    /// When `true`, `transform` returns `EncodedOutput::Sparse` instead of `Dense`.
    sparse: bool,
}
106
107impl OneHotEncoder {
108 pub fn new(_drop: Option<String>, handleunknown: &str, sparse: bool) -> Result<Self> {
118 if let Some(ref drop_strategy) = _drop {
119 if drop_strategy != "first" && drop_strategy != "if_binary" {
120 return Err(TransformError::InvalidInput(
121 "_drop must be 'first', 'if_binary', or None".to_string(),
122 ));
123 }
124 }
125
126 if handleunknown != "error" && handleunknown != "ignore" {
127 return Err(TransformError::InvalidInput(
128 "handleunknown must be 'error' or 'ignore'".to_string(),
129 ));
130 }
131
132 Ok(OneHotEncoder {
133 categories_: None,
134 drop: _drop,
135 handleunknown: handleunknown.to_string(),
136 sparse,
137 })
138 }
139
140 pub fn with_defaults() -> Self {
142 Self::new(None, "error", false).expect("Operation failed")
143 }
144
145 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
153 where
154 S: Data,
155 S::Elem: Float + NumCast,
156 {
157 let x_u64 = x.mapv(|x| {
158 let val_f64 = NumCast::from(x).unwrap_or(0.0);
159 val_f64 as u64
160 });
161
162 let n_samples = x_u64.shape()[0];
163 let n_features = x_u64.shape()[1];
164
165 if n_samples == 0 || n_features == 0 {
166 return Err(TransformError::InvalidInput("Empty input data".to_string()));
167 }
168
169 let mut categories = Vec::with_capacity(n_features);
170
171 for j in 0..n_features {
172 let mut unique_values: Vec<u64> = x_u64.column(j).to_vec();
174 unique_values.sort_unstable();
175 unique_values.dedup();
176
177 categories.push(unique_values);
178 }
179
180 self.categories_ = Some(categories);
181 Ok(())
182 }
183
184 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<EncodedOutput>
192 where
193 S: Data,
194 S::Elem: Float + NumCast,
195 {
196 let x_u64 = x.mapv(|x| {
197 let val_f64 = NumCast::from(x).unwrap_or(0.0);
198 val_f64 as u64
199 });
200
201 let n_samples = x_u64.shape()[0];
202 let n_features = x_u64.shape()[1];
203
204 if self.categories_.is_none() {
205 return Err(TransformError::TransformationError(
206 "OneHotEncoder has not been fitted".to_string(),
207 ));
208 }
209
210 let categories = self.categories_.as_ref().expect("Operation failed");
211
212 if n_features != categories.len() {
213 return Err(TransformError::InvalidInput(format!(
214 "x has {} features, but OneHotEncoder was fitted with {} features",
215 n_features,
216 categories.len()
217 )));
218 }
219
220 let mut total_features = 0;
222 for (j, feature_categories) in categories.iter().enumerate() {
223 let n_cats = feature_categories.len();
224
225 let n_output_cats = match &self.drop {
227 Some(strategy) if strategy == "first" => n_cats.saturating_sub(1),
228 Some(strategy) if strategy == "if_binary" && n_cats == 2 => 1,
229 _ => n_cats,
230 };
231
232 if n_output_cats == 0 {
233 return Err(TransformError::InvalidInput(format!(
234 "Feature {j} has only one category after dropping"
235 )));
236 }
237
238 total_features += n_output_cats;
239 }
240
241 let mut category_mappings = Vec::new();
243 let mut current_col = 0;
244
245 for feature_categories in categories.iter() {
246 let mut mapping = HashMap::new();
247 let n_cats = feature_categories.len();
248
249 let (start_idx, n_output_cats) = match &self.drop {
251 Some(strategy) if strategy == "first" => (1, n_cats.saturating_sub(1)),
252 Some(strategy) if strategy == "if_binary" && n_cats == 2 => (0, 1),
253 _ => (0, n_cats),
254 };
255
256 for (cat_idx, &category) in feature_categories.iter().enumerate() {
257 if cat_idx >= start_idx && cat_idx < start_idx + n_output_cats {
258 mapping.insert(category, current_col + cat_idx - start_idx);
259 }
260 }
261
262 category_mappings.push(mapping);
263 current_col += n_output_cats;
264 }
265
266 if self.sparse {
268 let mut sparse_matrix = SparseMatrix::new((n_samples, total_features));
270
271 for i in 0..n_samples {
272 for j in 0..n_features {
273 let value = x_u64[[i, j]];
274
275 if let Some(&col_idx) = category_mappings[j].get(&value) {
276 sparse_matrix.push(i, col_idx, 1.0);
277 } else {
278 let feature_categories = &categories[j];
280 let is_dropped_category = match &self.drop {
281 Some(strategy) if strategy == "first" => {
282 !feature_categories.is_empty() && value == feature_categories[0]
283 }
284 Some(strategy)
285 if strategy == "if_binary" && feature_categories.len() == 2 =>
286 {
287 feature_categories.len() == 2 && value == feature_categories[1]
288 }
289 _ => false,
290 };
291
292 if !is_dropped_category && self.handleunknown == "error" {
293 return Err(TransformError::InvalidInput(format!(
294 "Found unknown category {value} in feature {j}"
295 )));
296 }
297 }
299 }
300 }
301
302 Ok(EncodedOutput::Sparse(sparse_matrix))
303 } else {
304 let mut transformed = Array2::zeros((n_samples, total_features));
306
307 for i in 0..n_samples {
308 for j in 0..n_features {
309 let value = x_u64[[i, j]];
310
311 if let Some(&col_idx) = category_mappings[j].get(&value) {
312 transformed[[i, col_idx]] = 1.0;
313 } else {
314 let feature_categories = &categories[j];
316 let is_dropped_category = match &self.drop {
317 Some(strategy) if strategy == "first" => {
318 !feature_categories.is_empty() && value == feature_categories[0]
319 }
320 Some(strategy)
321 if strategy == "if_binary" && feature_categories.len() == 2 =>
322 {
323 feature_categories.len() == 2 && value == feature_categories[1]
324 }
325 _ => false,
326 };
327
328 if !is_dropped_category && self.handleunknown == "error" {
329 return Err(TransformError::InvalidInput(format!(
330 "Found unknown category {value} in feature {j}"
331 )));
332 }
333 }
335 }
336 }
337
338 Ok(EncodedOutput::Dense(transformed))
339 }
340 }
341
342 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<EncodedOutput>
350 where
351 S: Data,
352 S::Elem: Float + NumCast,
353 {
354 self.fit(x)?;
355 self.transform(x)
356 }
357
358 pub fn transform_dense<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
366 where
367 S: Data,
368 S::Elem: Float + NumCast,
369 {
370 Ok(self.transform(x)?.to_dense())
371 }
372
373 pub fn fit_transform_dense<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
381 where
382 S: Data,
383 S::Elem: Float + NumCast,
384 {
385 Ok(self.fit_transform(x)?.to_dense())
386 }
387
    /// Returns the fitted categories (one vector of codes per feature), or
    /// `None` when the encoder has not been fitted.
    pub fn categories(&self) -> Option<&Vec<Vec<u64>>> {
        self.categories_.as_ref()
    }
395
396 pub fn get_feature_names(&self, inputfeatures: Option<&[String]>) -> Result<Vec<String>> {
404 if self.categories_.is_none() {
405 return Err(TransformError::TransformationError(
406 "OneHotEncoder has not been fitted".to_string(),
407 ));
408 }
409
410 let categories = self.categories_.as_ref().expect("Operation failed");
411 let mut feature_names = Vec::new();
412
413 for (j, feature_categories) in categories.iter().enumerate() {
414 let feature_name = if let Some(names) = inputfeatures {
415 if j < names.len() {
416 names[j].clone()
417 } else {
418 format!("x{j}")
419 }
420 } else {
421 format!("x{j}")
422 };
423
424 let n_cats = feature_categories.len();
425
426 let (start_idx, n_output_cats) = match &self.drop {
428 Some(strategy) if strategy == "first" => (1, n_cats.saturating_sub(1)),
429 Some(strategy) if strategy == "if_binary" && n_cats == 2 => (0, 1),
430 _ => (0, n_cats),
431 };
432
433 for &category in feature_categories
434 .iter()
435 .skip(start_idx)
436 .take(n_output_cats)
437 {
438 feature_names.push(format!("{feature_name}_cat_{category}"));
439 }
440 }
441
442 Ok(feature_names)
443 }
444}
445
// `Debug` and `Clone` are derived for parity with the other encoder structs
// in this file (`TargetEncoder`, `BinaryEncoder`, ...), which all derive both.
#[derive(Debug, Clone)]
/// Ordinal encoder: configuration plus the categories learned by `fit`.
pub struct OrdinalEncoder {
    /// Sorted, de-duplicated category codes per input feature; `None` until fitted.
    categories_: Option<Vec<Vec<u64>>>,
    /// Policy for categories not seen during fitting: `"error"` or `"use_encoded_value"`.
    handleunknown: String,
    /// Value written for unknown categories under `"use_encoded_value"`.
    unknownvalue: Option<f64>,
}
458
459impl OrdinalEncoder {
460 pub fn new(handleunknown: &str, unknownvalue: Option<f64>) -> Result<Self> {
469 if handleunknown != "error" && handleunknown != "use_encoded_value" {
470 return Err(TransformError::InvalidInput(
471 "handleunknown must be 'error' or 'use_encoded_value'".to_string(),
472 ));
473 }
474
475 if handleunknown == "use_encoded_value" && unknownvalue.is_none() {
476 return Err(TransformError::InvalidInput(
477 "unknownvalue must be specified when handleunknown='use_encoded_value'".to_string(),
478 ));
479 }
480
481 Ok(OrdinalEncoder {
482 categories_: None,
483 handleunknown: handleunknown.to_string(),
484 unknownvalue,
485 })
486 }
487
488 pub fn with_defaults() -> Self {
490 Self::new("error", None).expect("Operation failed")
491 }
492
493 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
501 where
502 S: Data,
503 S::Elem: Float + NumCast,
504 {
505 let x_u64 = x.mapv(|x| {
506 let val_f64 = NumCast::from(x).unwrap_or(0.0);
507 val_f64 as u64
508 });
509
510 let n_samples = x_u64.shape()[0];
511 let n_features = x_u64.shape()[1];
512
513 if n_samples == 0 || n_features == 0 {
514 return Err(TransformError::InvalidInput("Empty input data".to_string()));
515 }
516
517 let mut categories = Vec::with_capacity(n_features);
518
519 for j in 0..n_features {
520 let mut unique_values: Vec<u64> = x_u64.column(j).to_vec();
522 unique_values.sort_unstable();
523 unique_values.dedup();
524
525 categories.push(unique_values);
526 }
527
528 self.categories_ = Some(categories);
529 Ok(())
530 }
531
532 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
540 where
541 S: Data,
542 S::Elem: Float + NumCast,
543 {
544 let x_u64 = x.mapv(|x| {
545 let val_f64 = NumCast::from(x).unwrap_or(0.0);
546 val_f64 as u64
547 });
548
549 let n_samples = x_u64.shape()[0];
550 let n_features = x_u64.shape()[1];
551
552 if self.categories_.is_none() {
553 return Err(TransformError::TransformationError(
554 "OrdinalEncoder has not been fitted".to_string(),
555 ));
556 }
557
558 let categories = self.categories_.as_ref().expect("Operation failed");
559
560 if n_features != categories.len() {
561 return Err(TransformError::InvalidInput(format!(
562 "x has {} features, but OrdinalEncoder was fitted with {} features",
563 n_features,
564 categories.len()
565 )));
566 }
567
568 let mut transformed = Array2::zeros((n_samples, n_features));
569
570 let mut category_mappings = Vec::new();
572 for feature_categories in categories {
573 let mut mapping = HashMap::new();
574 for (ordinal, &category) in feature_categories.iter().enumerate() {
575 mapping.insert(category, ordinal as f64);
576 }
577 category_mappings.push(mapping);
578 }
579
580 for i in 0..n_samples {
582 for j in 0..n_features {
583 let value = x_u64[[i, j]];
584
585 if let Some(&ordinal_value) = category_mappings[j].get(&value) {
586 transformed[[i, j]] = ordinal_value;
587 } else if self.handleunknown == "error" {
588 return Err(TransformError::InvalidInput(format!(
589 "Found unknown category {value} in feature {j}"
590 )));
591 } else {
592 transformed[[i, j]] = self.unknownvalue.expect("Operation failed");
594 }
595 }
596 }
597
598 Ok(transformed)
599 }
600
601 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
609 where
610 S: Data,
611 S::Elem: Float + NumCast,
612 {
613 self.fit(x)?;
614 self.transform(x)
615 }
616
    /// Returns the fitted categories (one vector of codes per feature), or
    /// `None` when the encoder has not been fitted.
    pub fn categories(&self) -> Option<&Vec<Vec<u64>>> {
        self.categories_.as_ref()
    }
624}
625
#[derive(Debug, Clone)]
/// Target encoder: replaces each category by a statistic of the target
/// values observed for that category during fitting.
pub struct TargetEncoder {
    /// Statistic to compute: `"mean"`, `"median"`, `"count"` or `"sum"`.
    strategy: String,
    /// Smoothing weight; when positive, mean/median encodings are pulled toward the global mean.
    smoothing: f64,
    /// Value used by `transform` for unknown categories when non-zero; when
    /// it is `0.0` the global mean is used instead.
    globalstat: f64,
    /// Per-feature map from category code to encoded value; `None` until fitted.
    encodings_: Option<Vec<HashMap<u64, f64>>>,
    /// Set to `true` once `fit` has completed.
    is_fitted: bool,
    /// Mean of the target values seen during fitting.
    global_mean_: f64,
}
658
659impl TargetEncoder {
660 pub fn new(_strategy: &str, smoothing: f64, globalstat: f64) -> Result<Self> {
670 if !["mean", "median", "count", "sum"].contains(&_strategy) {
671 return Err(TransformError::InvalidInput(
672 "_strategy must be 'mean', 'median', 'count', or 'sum'".to_string(),
673 ));
674 }
675
676 if smoothing < 0.0 {
677 return Err(TransformError::InvalidInput(
678 "smoothing parameter must be non-negative".to_string(),
679 ));
680 }
681
682 Ok(TargetEncoder {
683 strategy: _strategy.to_string(),
684 smoothing,
685 globalstat,
686 encodings_: None,
687 is_fitted: false,
688 global_mean_: 0.0,
689 })
690 }
691
692 pub fn with_mean(smoothing: f64) -> Self {
694 TargetEncoder {
695 strategy: "mean".to_string(),
696 smoothing,
697 globalstat: 0.0,
698 encodings_: None,
699 is_fitted: false,
700 global_mean_: 0.0,
701 }
702 }
703
704 pub fn with_median(smoothing: f64) -> Self {
706 TargetEncoder {
707 strategy: "median".to_string(),
708 smoothing,
709 globalstat: 0.0,
710 encodings_: None,
711 is_fitted: false,
712 global_mean_: 0.0,
713 }
714 }
715
716 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<()>
725 where
726 S: Data,
727 S::Elem: Float + NumCast,
728 {
729 let x_u64 = x.mapv(|x| {
730 let val_f64 = NumCast::from(x).unwrap_or(0.0);
731 val_f64 as u64
732 });
733
734 let n_samples = x_u64.shape()[0];
735 let n_features = x_u64.shape()[1];
736
737 if n_samples == 0 || n_features == 0 {
738 return Err(TransformError::InvalidInput("Empty input data".to_string()));
739 }
740
741 if y.len() != n_samples {
742 return Err(TransformError::InvalidInput(
743 "Number of target values must match number of samples".to_string(),
744 ));
745 }
746
747 self.global_mean_ = y.iter().sum::<f64>() / y.len() as f64;
749
750 let mut encodings = Vec::with_capacity(n_features);
751
752 for j in 0..n_features {
753 let mut category_targets: HashMap<u64, Vec<f64>> = HashMap::new();
755
756 for i in 0..n_samples {
757 let category = x_u64[[i, j]];
758 category_targets.entry(category).or_default().push(y[i]);
759 }
760
761 let mut category_encoding = HashMap::new();
763
764 for (category, targets) in category_targets.iter() {
765 let encoded_value = match self.strategy.as_str() {
766 "mean" => {
767 let category_mean = targets.iter().sum::<f64>() / targets.len() as f64;
768 let count = targets.len() as f64;
769
770 if self.smoothing > 0.0 {
772 (count * category_mean + self.smoothing * self.global_mean_)
773 / (count + self.smoothing)
774 } else {
775 category_mean
776 }
777 }
778 "median" => {
779 let mut sorted_targets = targets.clone();
780 sorted_targets.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
781
782 let median = if sorted_targets.len() % 2 == 0 {
783 let mid = sorted_targets.len() / 2;
784 (sorted_targets[mid - 1] + sorted_targets[mid]) / 2.0
785 } else {
786 sorted_targets[sorted_targets.len() / 2]
787 };
788
789 if self.smoothing > 0.0 {
791 let count = targets.len() as f64;
792 (count * median + self.smoothing * self.global_mean_)
793 / (count + self.smoothing)
794 } else {
795 median
796 }
797 }
798 "count" => targets.len() as f64,
799 "sum" => targets.iter().sum::<f64>(),
800 _ => unreachable!(),
801 };
802
803 category_encoding.insert(*category, encoded_value);
804 }
805
806 encodings.push(category_encoding);
807 }
808
809 self.encodings_ = Some(encodings);
810 self.is_fitted = true;
811 Ok(())
812 }
813
814 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
822 where
823 S: Data,
824 S::Elem: Float + NumCast,
825 {
826 if !self.is_fitted {
827 return Err(TransformError::TransformationError(
828 "TargetEncoder has not been fitted".to_string(),
829 ));
830 }
831
832 let x_u64 = x.mapv(|x| {
833 let val_f64 = NumCast::from(x).unwrap_or(0.0);
834 val_f64 as u64
835 });
836
837 let n_samples = x_u64.shape()[0];
838 let n_features = x_u64.shape()[1];
839
840 let encodings = self.encodings_.as_ref().expect("Operation failed");
841
842 if n_features != encodings.len() {
843 return Err(TransformError::InvalidInput(format!(
844 "x has {} features, but TargetEncoder was fitted with {} features",
845 n_features,
846 encodings.len()
847 )));
848 }
849
850 let mut transformed = Array2::zeros((n_samples, n_features));
851
852 for i in 0..n_samples {
853 for j in 0..n_features {
854 let category = x_u64[[i, j]];
855
856 if let Some(&encoded_value) = encodings[j].get(&category) {
857 transformed[[i, j]] = encoded_value;
858 } else {
859 transformed[[i, j]] = if self.globalstat != 0.0 {
861 self.globalstat
862 } else {
863 self.global_mean_
864 };
865 }
866 }
867 }
868
869 Ok(transformed)
870 }
871
872 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<Array2<f64>>
881 where
882 S: Data,
883 S::Elem: Float + NumCast,
884 {
885 self.fit(x, y)?;
886 self.transform(x)
887 }
888
    /// Returns the per-feature category-to-value maps, or `None` when the
    /// encoder has not been fitted.
    pub fn encodings(&self) -> Option<&Vec<HashMap<u64, f64>>> {
        self.encodings_.as_ref()
    }
896
    /// Returns `true` once `fit` has completed successfully.
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }
901
    /// Returns the stored global target mean (`0.0` until a fit computes it).
    pub fn global_mean(&self) -> f64 {
        self.global_mean_
    }
906
907 pub fn fit_transform_cv<S>(
921 &mut self,
922 x: &ArrayBase<S, Ix2>,
923 y: &[f64],
924 cv_folds: usize,
925 ) -> Result<Array2<f64>>
926 where
927 S: Data,
928 S::Elem: Float + NumCast,
929 {
930 let x_u64 = x.mapv(|x| {
931 let val_f64 = NumCast::from(x).unwrap_or(0.0);
932 val_f64 as u64
933 });
934
935 let n_samples = x_u64.shape()[0];
936 let n_features = x_u64.shape()[1];
937
938 if n_samples == 0 || n_features == 0 {
939 return Err(TransformError::InvalidInput("Empty input data".to_string()));
940 }
941
942 if y.len() != n_samples {
943 return Err(TransformError::InvalidInput(
944 "Number of target values must match number of samples".to_string(),
945 ));
946 }
947
948 if cv_folds < 2 {
949 return Err(TransformError::InvalidInput(
950 "cv_folds must be at least 2".to_string(),
951 ));
952 }
953
954 let mut transformed = Array2::zeros((n_samples, n_features));
955
956 self.global_mean_ = y.iter().sum::<f64>() / y.len() as f64;
958
959 let fold_size = n_samples / cv_folds;
961 let mut fold_indices = Vec::new();
962 for fold in 0..cv_folds {
963 let start = fold * fold_size;
964 let end = if fold == cv_folds - 1 {
965 n_samples
966 } else {
967 (fold + 1) * fold_size
968 };
969 fold_indices.push((start, end));
970 }
971
972 for fold in 0..cv_folds {
974 let (val_start, val_end) = fold_indices[fold];
975
976 let mut train_indices = Vec::new();
978 for (other_fold, &(start, end)) in fold_indices.iter().enumerate().take(cv_folds) {
979 if other_fold != fold {
980 train_indices.extend(start..end);
981 }
982 }
983
984 for j in 0..n_features {
986 let mut category_targets: HashMap<u64, Vec<f64>> = HashMap::new();
987
988 for &train_idx in &train_indices {
990 let category = x_u64[[train_idx, j]];
991 category_targets
992 .entry(category)
993 .or_default()
994 .push(y[train_idx]);
995 }
996
997 let mut category_encoding = HashMap::new();
999 for (category, targets) in category_targets.iter() {
1000 let encoded_value = match self.strategy.as_str() {
1001 "mean" => {
1002 let category_mean = targets.iter().sum::<f64>() / targets.len() as f64;
1003 let count = targets.len() as f64;
1004
1005 if self.smoothing > 0.0 {
1006 (count * category_mean + self.smoothing * self.global_mean_)
1007 / (count + self.smoothing)
1008 } else {
1009 category_mean
1010 }
1011 }
1012 "median" => {
1013 let mut sorted_targets = targets.clone();
1014 sorted_targets
1015 .sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
1016
1017 let median = if sorted_targets.len() % 2 == 0 {
1018 let mid = sorted_targets.len() / 2;
1019 (sorted_targets[mid - 1] + sorted_targets[mid]) / 2.0
1020 } else {
1021 sorted_targets[sorted_targets.len() / 2]
1022 };
1023
1024 if self.smoothing > 0.0 {
1025 let count = targets.len() as f64;
1026 (count * median + self.smoothing * self.global_mean_)
1027 / (count + self.smoothing)
1028 } else {
1029 median
1030 }
1031 }
1032 "count" => targets.len() as f64,
1033 "sum" => targets.iter().sum::<f64>(),
1034 _ => unreachable!(),
1035 };
1036
1037 category_encoding.insert(*category, encoded_value);
1038 }
1039
1040 for val_idx in val_start..val_end {
1042 let category = x_u64[[val_idx, j]];
1043
1044 if let Some(&encoded_value) = category_encoding.get(&category) {
1045 transformed[[val_idx, j]] = encoded_value;
1046 } else {
1047 transformed[[val_idx, j]] = self.global_mean_;
1049 }
1050 }
1051 }
1052 }
1053
1054 self.fit(x, y)?;
1056
1057 Ok(transformed)
1058 }
1059}
1060
#[derive(Debug, Clone)]
/// Binary encoder: represents each category by a fixed-width sequence of bits.
pub struct BinaryEncoder {
    /// Per-feature map from category code to its bit sequence; `None` until fitted.
    categories_: Option<Vec<HashMap<u64, Vec<u8>>>>,
    /// Number of output columns (bits) used by each feature; `None` until fitted.
    n_binary_features_: Option<Vec<usize>>,
    /// Policy for categories not seen during fitting: `"error"` or `"ignore"`.
    handleunknown: String,
    /// Set to `true` once `fit` has completed.
    is_fitted: bool,
}
1079
1080impl BinaryEncoder {
1081 pub fn new(handleunknown: &str) -> Result<Self> {
1091 if handleunknown != "error" && handleunknown != "ignore" {
1092 return Err(TransformError::InvalidInput(
1093 "handleunknown must be 'error' or 'ignore'".to_string(),
1094 ));
1095 }
1096
1097 Ok(BinaryEncoder {
1098 categories_: None,
1099 n_binary_features_: None,
1100 handleunknown: handleunknown.to_string(),
1101 is_fitted: false,
1102 })
1103 }
1104
1105 pub fn with_defaults() -> Self {
1107 Self::new("error").expect("Operation failed")
1108 }
1109
1110 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
1118 where
1119 S: Data,
1120 S::Elem: Float + NumCast,
1121 {
1122 let x_u64 = x.mapv(|x| {
1123 let val_f64 = NumCast::from(x).unwrap_or(0.0);
1124 val_f64 as u64
1125 });
1126
1127 let n_samples = x_u64.shape()[0];
1128 let n_features = x_u64.shape()[1];
1129
1130 if n_samples == 0 || n_features == 0 {
1131 return Err(TransformError::InvalidInput("Empty input data".to_string()));
1132 }
1133
1134 let mut categories = Vec::with_capacity(n_features);
1135 let mut n_binary_features = Vec::with_capacity(n_features);
1136
1137 for j in 0..n_features {
1138 let mut unique_categories: Vec<u64> = x_u64.column(j).to_vec();
1140 unique_categories.sort_unstable();
1141 unique_categories.dedup();
1142
1143 if unique_categories.is_empty() {
1144 return Err(TransformError::InvalidInput(
1145 "Feature has no valid categories".to_string(),
1146 ));
1147 }
1148
1149 let n_cats = unique_categories.len();
1151 let nbits = if n_cats <= 1 {
1152 1
1153 } else {
1154 (n_cats as f64).log2().ceil() as usize
1155 };
1156
1157 let mut category_map = HashMap::new();
1159 for (idx, &category) in unique_categories.iter().enumerate() {
1160 let binary_code = Self::int_to_binary(idx, nbits);
1161 category_map.insert(category, binary_code);
1162 }
1163
1164 categories.push(category_map);
1165 n_binary_features.push(nbits);
1166 }
1167
1168 self.categories_ = Some(categories);
1169 self.n_binary_features_ = Some(n_binary_features);
1170 self.is_fitted = true;
1171
1172 Ok(())
1173 }
1174
1175 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1183 where
1184 S: Data,
1185 S::Elem: Float + NumCast,
1186 {
1187 if !self.is_fitted {
1188 return Err(TransformError::InvalidInput(
1189 "Encoder has not been fitted yet".to_string(),
1190 ));
1191 }
1192
1193 let categories = self.categories_.as_ref().expect("Operation failed");
1194 let n_binary_features = self.n_binary_features_.as_ref().expect("Operation failed");
1195
1196 let x_u64 = x.mapv(|x| {
1197 let val_f64 = NumCast::from(x).unwrap_or(0.0);
1198 val_f64 as u64
1199 });
1200
1201 let n_samples = x_u64.shape()[0];
1202 let n_features = x_u64.shape()[1];
1203
1204 if n_features != categories.len() {
1205 return Err(TransformError::InvalidInput(format!(
1206 "Number of features ({}) does not match fitted features ({})",
1207 n_features,
1208 categories.len()
1209 )));
1210 }
1211
1212 let total_binary_features: usize = n_binary_features.iter().sum();
1214 let mut result = Array2::<f64>::zeros((n_samples, total_binary_features));
1215
1216 let mut output_col = 0;
1217 for j in 0..n_features {
1218 let category_map = &categories[j];
1219 let nbits = n_binary_features[j];
1220
1221 for i in 0..n_samples {
1222 let category = x_u64[[i, j]];
1223
1224 if let Some(binary_code) = category_map.get(&category) {
1225 for (bit_idx, &bit_val) in binary_code.iter().enumerate() {
1227 result[[i, output_col + bit_idx]] = bit_val as f64;
1228 }
1229 } else {
1230 match self.handleunknown.as_str() {
1232 "error" => {
1233 return Err(TransformError::InvalidInput(format!(
1234 "Unknown category {category} in feature {j}"
1235 )));
1236 }
1237 "ignore" => {
1238 }
1240 _ => unreachable!(),
1241 }
1242 }
1243 }
1244
1245 output_col += nbits;
1246 }
1247
1248 Ok(result)
1249 }
1250
1251 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1259 where
1260 S: Data,
1261 S::Elem: Float + NumCast,
1262 {
1263 self.fit(x)?;
1264 self.transform(x)
1265 }
1266
    /// Returns `true` once `fit` has completed successfully.
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }
1271
    /// Returns the per-feature category-to-bits maps, or `None` when the
    /// encoder has not been fitted.
    pub fn categories(&self) -> Option<&Vec<HashMap<u64, Vec<u8>>>> {
        self.categories_.as_ref()
    }
1276
    /// Returns the number of bit columns used by each feature, or `None` when
    /// the encoder has not been fitted.
    pub fn n_binary_features(&self) -> Option<&Vec<usize>> {
        self.n_binary_features_.as_ref()
    }
1281
1282 pub fn n_output_features(&self) -> Option<usize> {
1284 self.n_binary_features_.as_ref().map(|v| v.iter().sum())
1285 }
1286
1287 fn int_to_binary(_value: usize, nbits: usize) -> Vec<u8> {
1289 let mut binary = Vec::with_capacity(nbits);
1290 let mut val = _value;
1291
1292 for _ in 0..nbits {
1293 binary.push((val & 1) as u8);
1294 val >>= 1;
1295 }
1296
1297 binary.reverse(); binary
1299 }
1300}
1301
#[derive(Debug, Clone)]
/// Frequency encoder: replaces each category by how often it occurred
/// during fitting.
pub struct FrequencyEncoder {
    /// Per-feature map from category code to its count or relative frequency; `None` until fitted.
    frequency_maps_: Option<Vec<HashMap<u64, f64>>>,
    /// When `true`, counts are divided by the number of fitted samples.
    normalize: bool,
    /// Policy for unknown categories: `"error"`, `"ignore"` or `"use_encoded_value"`.
    handleunknown: String,
    /// Value written for unknown categories under `"use_encoded_value"`.
    unknownvalue: f64,
    /// Set to `true` once `fit` has completed.
    is_fitted: bool,
}
1320
1321impl FrequencyEncoder {
1322 pub fn new(normalize: bool, handleunknown: &str, unknownvalue: f64) -> Result<Self> {
1332 if !["error", "ignore", "use_encoded_value"].contains(&handleunknown) {
1333 return Err(TransformError::InvalidInput(
1334 "handleunknown must be 'error', 'ignore', or 'use_encoded_value'".to_string(),
1335 ));
1336 }
1337
1338 Ok(FrequencyEncoder {
1339 frequency_maps_: None,
1340 normalize,
1341 handleunknown: handleunknown.to_string(),
1342 unknownvalue,
1343 is_fitted: false,
1344 })
1345 }
1346
1347 pub fn with_defaults() -> Self {
1349 Self::new(false, "error", 0.0).expect("Operation failed")
1350 }
1351
1352 pub fn with_normalization() -> Self {
1354 Self::new(true, "error", 0.0).expect("Operation failed")
1355 }
1356
1357 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
1365 where
1366 S: Data,
1367 S::Elem: Float + NumCast,
1368 {
1369 let x_u64 = x.mapv(|x| {
1370 let val_f64 = NumCast::from(x).unwrap_or(0.0);
1371 val_f64 as u64
1372 });
1373
1374 let n_samples = x_u64.shape()[0];
1375 let n_features = x_u64.shape()[1];
1376
1377 if n_samples == 0 || n_features == 0 {
1378 return Err(TransformError::InvalidInput("Empty input data".to_string()));
1379 }
1380
1381 let mut frequency_maps = Vec::with_capacity(n_features);
1382
1383 for j in 0..n_features {
1384 let mut category_counts: HashMap<u64, usize> = HashMap::new();
1386 for i in 0..n_samples {
1387 let category = x_u64[[i, j]];
1388 *category_counts.entry(category).or_insert(0) += 1;
1389 }
1390
1391 let mut frequency_map = HashMap::new();
1393 for (category, count) in category_counts {
1394 let frequency = if self.normalize {
1395 count as f64 / n_samples as f64
1396 } else {
1397 count as f64
1398 };
1399 frequency_map.insert(category, frequency);
1400 }
1401
1402 frequency_maps.push(frequency_map);
1403 }
1404
1405 self.frequency_maps_ = Some(frequency_maps);
1406 self.is_fitted = true;
1407 Ok(())
1408 }
1409
1410 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1418 where
1419 S: Data,
1420 S::Elem: Float + NumCast,
1421 {
1422 if !self.is_fitted {
1423 return Err(TransformError::TransformationError(
1424 "FrequencyEncoder has not been fitted".to_string(),
1425 ));
1426 }
1427
1428 let frequency_maps = self.frequency_maps_.as_ref().expect("Operation failed");
1429
1430 let x_u64 = x.mapv(|x| {
1431 let val_f64 = NumCast::from(x).unwrap_or(0.0);
1432 val_f64 as u64
1433 });
1434
1435 let n_samples = x_u64.shape()[0];
1436 let n_features = x_u64.shape()[1];
1437
1438 if n_features != frequency_maps.len() {
1439 return Err(TransformError::InvalidInput(format!(
1440 "x has {} features, but FrequencyEncoder was fitted with {} features",
1441 n_features,
1442 frequency_maps.len()
1443 )));
1444 }
1445
1446 let mut transformed = Array2::zeros((n_samples, n_features));
1447
1448 for i in 0..n_samples {
1449 for j in 0..n_features {
1450 let category = x_u64[[i, j]];
1451
1452 if let Some(&frequency) = frequency_maps[j].get(&category) {
1453 transformed[[i, j]] = frequency;
1454 } else {
1455 match self.handleunknown.as_str() {
1457 "error" => {
1458 return Err(TransformError::InvalidInput(format!(
1459 "Unknown category {category} in feature {j}"
1460 )));
1461 }
1462 "ignore" => {
1463 transformed[[i, j]] = 0.0;
1464 }
1465 "use_encoded_value" => {
1466 transformed[[i, j]] = self.unknownvalue;
1467 }
1468 _ => unreachable!(),
1469 }
1470 }
1471 }
1472 }
1473
1474 Ok(transformed)
1475 }
1476
1477 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1485 where
1486 S: Data,
1487 S::Elem: Float + NumCast,
1488 {
1489 self.fit(x)?;
1490 self.transform(x)
1491 }
1492
    /// Returns `true` once `fit` has completed successfully.
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }
1497
    /// Returns the per-feature category-to-frequency maps, or `None` when the
    /// encoder has not been fitted.
    pub fn frequency_maps(&self) -> Option<&Vec<HashMap<u64, f64>>> {
        self.frequency_maps_.as_ref()
    }
1502}
1503
#[derive(Debug, Clone)]
/// Weight-of-evidence (WOE) encoder state for a binary (0/1) target.
pub struct WOEEncoder {
    /// Per-feature map from category code to an `f64` value; starts as `None`.
    /// Presumably the WOE of each category — the code that fills it is not visible here.
    woe_maps_: Option<Vec<HashMap<u64, f64>>>,
    /// One `f64` per feature; starts as `None`. Presumably the information value — TODO confirm.
    information_values_: Option<Vec<f64>>,
    /// Non-negative constant added to the event / non-event counts of each category.
    regularization: f64,
    /// Policy for unknown categories: `"error"`, `"global_woe"` or `"use_encoded_value"`.
    handleunknown: String,
    /// Presumably the value used for unknown categories under `"use_encoded_value"` — TODO confirm.
    unknownvalue: f64,
    /// `ln(total_events / total_non_events)` of the fitted target; `0.0` before fitting.
    global_woe_: f64,
    /// Fitted flag; starts as `false`.
    is_fitted: bool,
}
1528
1529impl WOEEncoder {
1530 pub fn new(regularization: f64, handleunknown: &str, unknownvalue: f64) -> Result<Self> {
1540 if regularization < 0.0 {
1541 return Err(TransformError::InvalidInput(
1542 "regularization must be non-negative".to_string(),
1543 ));
1544 }
1545
1546 if !["error", "global_woe", "use_encoded_value"].contains(&handleunknown) {
1547 return Err(TransformError::InvalidInput(
1548 "handleunknown must be 'error', 'global_woe', or 'use_encoded_value'".to_string(),
1549 ));
1550 }
1551
1552 Ok(WOEEncoder {
1553 woe_maps_: None,
1554 information_values_: None,
1555 regularization,
1556 handleunknown: handleunknown.to_string(),
1557 unknownvalue,
1558 global_woe_: 0.0,
1559 is_fitted: false,
1560 })
1561 }
1562
1563 pub fn with_defaults() -> Self {
1565 Self::new(0.5, "global_woe", 0.0).expect("Operation failed")
1566 }
1567
1568 pub fn with_regularization(regularization: f64) -> Result<Self> {
1570 Self::new(regularization, "global_woe", 0.0)
1571 }
1572
1573 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<()>
1582 where
1583 S: Data,
1584 S::Elem: Float + NumCast,
1585 {
1586 let x_u64 = x.mapv(|x| {
1587 let val_f64 = NumCast::from(x).unwrap_or(0.0);
1588 val_f64 as u64
1589 });
1590
1591 let n_samples = x_u64.shape()[0];
1592 let n_features = x_u64.shape()[1];
1593
1594 if n_samples == 0 || n_features == 0 {
1595 return Err(TransformError::InvalidInput("Empty input data".to_string()));
1596 }
1597
1598 if y.len() != n_samples {
1599 return Err(TransformError::InvalidInput(
1600 "Number of target values must match number of samples".to_string(),
1601 ));
1602 }
1603
1604 for &target in y {
1606 if target != 0.0 && target != 1.0 {
1607 return Err(TransformError::InvalidInput(
1608 "Target values must be binary (0 or 1)".to_string(),
1609 ));
1610 }
1611 }
1612
1613 let total_events: f64 = y.iter().sum();
1615 let total_non_events = n_samples as f64 - total_events;
1616
1617 if total_events == 0.0 || total_non_events == 0.0 {
1618 return Err(TransformError::InvalidInput(
1619 "Target must contain both 0 and 1 values".to_string(),
1620 ));
1621 }
1622
1623 self.global_woe_ = (total_events / total_non_events).ln();
1625
1626 let mut woe_maps = Vec::with_capacity(n_features);
1627 let mut information_values = Vec::with_capacity(n_features);
1628
1629 for j in 0..n_features {
1630 let mut category_stats: HashMap<u64, (f64, f64)> = HashMap::new(); for i in 0..n_samples {
1634 let category = x_u64[[i, j]];
1635 let target = y[i];
1636
1637 let (events, non_events) = category_stats.entry(category).or_insert((0.0, 0.0));
1638 if target == 1.0 {
1639 *events += 1.0;
1640 } else {
1641 *non_events += 1.0;
1642 }
1643 }
1644
1645 let mut woe_map = HashMap::new();
1647 let mut feature_iv = 0.0;
1648
1649 for (category, (events, non_events)) in category_stats.iter() {
1650 let reg_events = events + self.regularization;
1652 let reg_non_events = non_events + self.regularization;
1653 let reg_total_events =
1654 total_events + self.regularization * category_stats.len() as f64;
1655 let reg_total_non_events =
1656 total_non_events + self.regularization * category_stats.len() as f64;
1657
1658 let event_rate = reg_events / reg_total_events;
1660 let non_event_rate = reg_non_events / reg_total_non_events;
1661
1662 let woe = (event_rate / non_event_rate).ln();
1664 woe_map.insert(*category, woe);
1665
1666 let iv_contribution = (event_rate - non_event_rate) * woe;
1668 feature_iv += iv_contribution;
1669 }
1670
1671 woe_maps.push(woe_map);
1672 information_values.push(feature_iv);
1673 }
1674
1675 self.woe_maps_ = Some(woe_maps);
1676 self.information_values_ = Some(information_values);
1677 self.is_fitted = true;
1678 Ok(())
1679 }
1680
1681 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1689 where
1690 S: Data,
1691 S::Elem: Float + NumCast,
1692 {
1693 if !self.is_fitted {
1694 return Err(TransformError::TransformationError(
1695 "WOEEncoder has not been fitted".to_string(),
1696 ));
1697 }
1698
1699 let woe_maps = self.woe_maps_.as_ref().expect("Operation failed");
1700
1701 let x_u64 = x.mapv(|x| {
1702 let val_f64 = NumCast::from(x).unwrap_or(0.0);
1703 val_f64 as u64
1704 });
1705
1706 let n_samples = x_u64.shape()[0];
1707 let n_features = x_u64.shape()[1];
1708
1709 if n_features != woe_maps.len() {
1710 return Err(TransformError::InvalidInput(format!(
1711 "x has {} features, but WOEEncoder was fitted with {} features",
1712 n_features,
1713 woe_maps.len()
1714 )));
1715 }
1716
1717 let mut transformed = Array2::zeros((n_samples, n_features));
1718
1719 for i in 0..n_samples {
1720 for j in 0..n_features {
1721 let category = x_u64[[i, j]];
1722
1723 if let Some(&woe_value) = woe_maps[j].get(&category) {
1724 transformed[[i, j]] = woe_value;
1725 } else {
1726 match self.handleunknown.as_str() {
1728 "error" => {
1729 return Err(TransformError::InvalidInput(format!(
1730 "Unknown category {category} in feature {j}"
1731 )));
1732 }
1733 "global_woe" => {
1734 transformed[[i, j]] = self.global_woe_;
1735 }
1736 "use_encoded_value" => {
1737 transformed[[i, j]] = self.unknownvalue;
1738 }
1739 _ => unreachable!(),
1740 }
1741 }
1742 }
1743 }
1744
1745 Ok(transformed)
1746 }
1747
1748 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<Array2<f64>>
1757 where
1758 S: Data,
1759 S::Elem: Float + NumCast,
1760 {
1761 self.fit(x, y)?;
1762 self.transform(x)
1763 }
1764
/// Reports whether `fit` has completed successfully on this encoder.
///
/// # Returns
/// * `bool` - The current value of the internal `is_fitted` flag
pub fn is_fitted(&self) -> bool {
    self.is_fitted
}
1769
/// Returns the learned WOE maps, one per feature.
///
/// # Returns
/// * `Option<&Vec<HashMap<u64, f64>>>` - For each feature, a map from category to WOE
///   value; `None` if the encoder has not been fitted
pub fn woe_maps(&self) -> Option<&Vec<HashMap<u64, f64>>> {
    self.woe_maps_.as_ref()
}
1774
/// Returns the Information Value computed for each feature during `fit`.
///
/// Each value is the sum over the feature's categories of
/// `(event_rate - non_event_rate) * woe`.
///
/// # Returns
/// * `Option<&Vec<f64>>` - One value per feature, in column order; `None` if the
///   encoder has not been fitted
pub fn information_values(&self) -> Option<&Vec<f64>> {
    self.information_values_.as_ref()
}
1786
/// Returns the global WOE used for unknown categories under the `"global_woe"` strategy.
///
/// # Returns
/// * `f64` - `ln(total_events / total_non_events)` from the last successful `fit`, or
///   `0.0` if the encoder has never been fitted
pub fn global_woe(&self) -> f64 {
    self.global_woe_
}
1791
1792 pub fn feature_importance_ranking(&self) -> Option<Vec<(usize, f64)>> {
1797 self.information_values_.as_ref().map(|ivs| {
1798 let mut ranking: Vec<(usize, f64)> =
1799 ivs.iter().enumerate().map(|(idx, &iv)| (idx, iv)).collect();
1800 ranking.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
1801 ranking
1802 })
1803 }
1804}
1805
// Unit tests for this module are kept in the separate file `encoding_tests.rs` and are
// compiled only for test builds.
#[cfg(test)]
#[path = "encoding_tests.rs"]
mod tests;