// scirs2_transform/encoding.rs
1//! Categorical data encoding utilities
2//!
3//! This module provides methods for encoding categorical data into numerical
4//! formats suitable for machine learning algorithms.
5
6use scirs2_core::ndarray::{Array2, ArrayBase, Data, Ix2};
7use scirs2_core::numeric::{Float, NumCast};
8use std::collections::HashMap;
9
10use crate::error::{Result, TransformError};
11
/// Simple sparse matrix representation in COO (Coordinate) format
///
/// Stores only the non-zero entries as three parallel vectors
/// (`row_indices`, `col_indices`, `values`), in insertion order.
/// Use [`SparseMatrix::to_dense`] to materialize the full matrix.
#[derive(Debug, Clone)]
pub struct SparseMatrix {
    /// Shape of the matrix (rows, cols)
    pub shape: (usize, usize),
    /// Row indices of non-zero values (parallel to `col_indices` and `values`)
    pub row_indices: Vec<usize>,
    /// Column indices of non-zero values
    pub col_indices: Vec<usize>,
    /// Non-zero values
    pub values: Vec<f64>,
}
24
25impl SparseMatrix {
26    /// Create a new empty sparse matrix
27    pub fn new(shape: (usize, usize)) -> Self {
28        SparseMatrix {
29            shape,
30            row_indices: Vec::new(),
31            col_indices: Vec::new(),
32            values: Vec::new(),
33        }
34    }
35
36    /// Add a non-zero value at (row, col)
37    pub fn push(&mut self, row: usize, col: usize, value: f64) {
38        if row < self.shape.0 && col < self.shape.1 && value != 0.0 {
39            self.row_indices.push(row);
40            self.col_indices.push(col);
41            self.values.push(value);
42        }
43    }
44
45    /// Convert to dense Array2
46    pub fn to_dense(&self) -> Array2<f64> {
47        let mut dense = Array2::zeros(self.shape);
48        for ((&row, &col), &val) in self
49            .row_indices
50            .iter()
51            .zip(self.col_indices.iter())
52            .zip(self.values.iter())
53        {
54            dense[[row, col]] = val;
55        }
56        dense
57    }
58
59    /// Get number of non-zero elements
60    pub fn nnz(&self) -> usize {
61        self.values.len()
62    }
63}
64
/// Output format for encoded data
///
/// `OneHotEncoder::transform` can produce either a dense matrix or a
/// memory-saving COO sparse matrix; this enum carries either form.
#[derive(Debug, Clone)]
pub enum EncodedOutput {
    /// Dense matrix representation
    Dense(Array2<f64>),
    /// Sparse matrix representation
    Sparse(SparseMatrix),
}
73
74impl EncodedOutput {
75    /// Convert to dense matrix (creates copy if sparse)
76    pub fn to_dense(&self) -> Array2<f64> {
77        match self {
78            EncodedOutput::Dense(arr) => arr.clone(),
79            EncodedOutput::Sparse(sparse) => sparse.to_dense(),
80        }
81    }
82
83    /// Get shape of the output
84    pub fn shape(&self) -> (usize, usize) {
85        match self {
86            EncodedOutput::Dense(arr) => (arr.nrows(), arr.ncols()),
87            EncodedOutput::Sparse(sparse) => sparse.shape,
88        }
89    }
90}
91
/// OneHotEncoder for converting categorical features to binary features
///
/// This transformer converts categorical features into a one-hot encoded representation,
/// where each category is represented by a binary feature.
pub struct OneHotEncoder {
    /// Categories for each feature (learned during fit; sorted ascending, deduplicated)
    categories_: Option<Vec<Vec<u64>>>,
    /// Drop strategy: Some("first"), Some("if_binary"), or None (keep all categories)
    drop: Option<String>,
    /// How to handle unknown categories at transform time: "error" or "ignore"
    handleunknown: String,
    /// When true, `transform` returns `EncodedOutput::Sparse` instead of `Dense`
    sparse: bool,
}
106
107impl OneHotEncoder {
108    /// Creates a new OneHotEncoder
109    ///
110    /// # Arguments
111    /// * `drop` - Strategy for dropping categories ('first', 'if_binary', or None)
112    /// * `handleunknown` - How to handle unknown categories ('error' or 'ignore')
113    /// * `sparse` - Whether to return sparse arrays
114    ///
115    /// # Returns
116    /// * A new OneHotEncoder instance
117    pub fn new(_drop: Option<String>, handleunknown: &str, sparse: bool) -> Result<Self> {
118        if let Some(ref drop_strategy) = _drop {
119            if drop_strategy != "first" && drop_strategy != "if_binary" {
120                return Err(TransformError::InvalidInput(
121                    "_drop must be 'first', 'if_binary', or None".to_string(),
122                ));
123            }
124        }
125
126        if handleunknown != "error" && handleunknown != "ignore" {
127            return Err(TransformError::InvalidInput(
128                "handleunknown must be 'error' or 'ignore'".to_string(),
129            ));
130        }
131
132        Ok(OneHotEncoder {
133            categories_: None,
134            drop: _drop,
135            handleunknown: handleunknown.to_string(),
136            sparse,
137        })
138    }
139
140    /// Creates a OneHotEncoder with default settings
141    pub fn with_defaults() -> Self {
142        Self::new(None, "error", false).expect("Operation failed")
143    }
144
145    /// Fits the OneHotEncoder to the input data
146    ///
147    /// # Arguments
148    /// * `x` - The input categorical data, shape (n_samples, n_features)
149    ///
150    /// # Returns
151    /// * `Result<()>` - Ok if successful, Err otherwise
152    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
153    where
154        S: Data,
155        S::Elem: Float + NumCast,
156    {
157        let x_u64 = x.mapv(|x| {
158            let val_f64 = NumCast::from(x).unwrap_or(0.0);
159            val_f64 as u64
160        });
161
162        let n_samples = x_u64.shape()[0];
163        let n_features = x_u64.shape()[1];
164
165        if n_samples == 0 || n_features == 0 {
166            return Err(TransformError::InvalidInput("Empty input data".to_string()));
167        }
168
169        let mut categories = Vec::with_capacity(n_features);
170
171        for j in 0..n_features {
172            // Collect unique values for this feature
173            let mut unique_values: Vec<u64> = x_u64.column(j).to_vec();
174            unique_values.sort_unstable();
175            unique_values.dedup();
176
177            categories.push(unique_values);
178        }
179
180        self.categories_ = Some(categories);
181        Ok(())
182    }
183
184    /// Transforms the input data using the fitted OneHotEncoder
185    ///
186    /// # Arguments
187    /// * `x` - The input categorical data, shape (n_samples, n_features)
188    ///
189    /// # Returns
190    /// * `Result<EncodedOutput>` - The one-hot encoded data (dense or sparse)
191    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<EncodedOutput>
192    where
193        S: Data,
194        S::Elem: Float + NumCast,
195    {
196        let x_u64 = x.mapv(|x| {
197            let val_f64 = NumCast::from(x).unwrap_or(0.0);
198            val_f64 as u64
199        });
200
201        let n_samples = x_u64.shape()[0];
202        let n_features = x_u64.shape()[1];
203
204        if self.categories_.is_none() {
205            return Err(TransformError::TransformationError(
206                "OneHotEncoder has not been fitted".to_string(),
207            ));
208        }
209
210        let categories = self.categories_.as_ref().expect("Operation failed");
211
212        if n_features != categories.len() {
213            return Err(TransformError::InvalidInput(format!(
214                "x has {} features, but OneHotEncoder was fitted with {} features",
215                n_features,
216                categories.len()
217            )));
218        }
219
220        // Calculate total number of output features
221        let mut total_features = 0;
222        for (j, feature_categories) in categories.iter().enumerate() {
223            let n_cats = feature_categories.len();
224
225            // Apply drop strategy
226            let n_output_cats = match &self.drop {
227                Some(strategy) if strategy == "first" => n_cats.saturating_sub(1),
228                Some(strategy) if strategy == "if_binary" && n_cats == 2 => 1,
229                _ => n_cats,
230            };
231
232            if n_output_cats == 0 {
233                return Err(TransformError::InvalidInput(format!(
234                    "Feature {j} has only one category after dropping"
235                )));
236            }
237
238            total_features += n_output_cats;
239        }
240
241        // Create mappings from category values to column indices
242        let mut category_mappings = Vec::new();
243        let mut current_col = 0;
244
245        for feature_categories in categories.iter() {
246            let mut mapping = HashMap::new();
247            let n_cats = feature_categories.len();
248
249            // Determine how many categories to keep
250            let (start_idx, n_output_cats) = match &self.drop {
251                Some(strategy) if strategy == "first" => (1, n_cats.saturating_sub(1)),
252                Some(strategy) if strategy == "if_binary" && n_cats == 2 => (0, 1),
253                _ => (0, n_cats),
254            };
255
256            for (cat_idx, &category) in feature_categories.iter().enumerate() {
257                if cat_idx >= start_idx && cat_idx < start_idx + n_output_cats {
258                    mapping.insert(category, current_col + cat_idx - start_idx);
259                }
260            }
261
262            category_mappings.push(mapping);
263            current_col += n_output_cats;
264        }
265
266        // Create output based on sparse setting
267        if self.sparse {
268            // Sparse output
269            let mut sparse_matrix = SparseMatrix::new((n_samples, total_features));
270
271            for i in 0..n_samples {
272                for j in 0..n_features {
273                    let value = x_u64[[i, j]];
274
275                    if let Some(&col_idx) = category_mappings[j].get(&value) {
276                        sparse_matrix.push(i, col_idx, 1.0);
277                    } else {
278                        // Check if this is a dropped category
279                        let feature_categories = &categories[j];
280                        let is_dropped_category = match &self.drop {
281                            Some(strategy) if strategy == "first" => {
282                                !feature_categories.is_empty() && value == feature_categories[0]
283                            }
284                            Some(strategy)
285                                if strategy == "if_binary" && feature_categories.len() == 2 =>
286                            {
287                                feature_categories.len() == 2 && value == feature_categories[1]
288                            }
289                            _ => false,
290                        };
291
292                        if !is_dropped_category && self.handleunknown == "error" {
293                            return Err(TransformError::InvalidInput(format!(
294                                "Found unknown category {value} in feature {j}"
295                            )));
296                        }
297                        // If it's a dropped category or handleunknown == "ignore", we don't add anything (sparse)
298                    }
299                }
300            }
301
302            Ok(EncodedOutput::Sparse(sparse_matrix))
303        } else {
304            // Dense output
305            let mut transformed = Array2::zeros((n_samples, total_features));
306
307            for i in 0..n_samples {
308                for j in 0..n_features {
309                    let value = x_u64[[i, j]];
310
311                    if let Some(&col_idx) = category_mappings[j].get(&value) {
312                        transformed[[i, col_idx]] = 1.0;
313                    } else {
314                        // Check if this is a dropped category
315                        let feature_categories = &categories[j];
316                        let is_dropped_category = match &self.drop {
317                            Some(strategy) if strategy == "first" => {
318                                !feature_categories.is_empty() && value == feature_categories[0]
319                            }
320                            Some(strategy)
321                                if strategy == "if_binary" && feature_categories.len() == 2 =>
322                            {
323                                feature_categories.len() == 2 && value == feature_categories[1]
324                            }
325                            _ => false,
326                        };
327
328                        if !is_dropped_category && self.handleunknown == "error" {
329                            return Err(TransformError::InvalidInput(format!(
330                                "Found unknown category {value} in feature {j}"
331                            )));
332                        }
333                        // If it's a dropped category or handleunknown == "ignore", we just leave it as 0
334                    }
335                }
336            }
337
338            Ok(EncodedOutput::Dense(transformed))
339        }
340    }
341
342    /// Fits the OneHotEncoder to the input data and transforms it
343    ///
344    /// # Arguments
345    /// * `x` - The input categorical data, shape (n_samples, n_features)
346    ///
347    /// # Returns
348    /// * `Result<EncodedOutput>` - The one-hot encoded data (dense or sparse)
349    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<EncodedOutput>
350    where
351        S: Data,
352        S::Elem: Float + NumCast,
353    {
354        self.fit(x)?;
355        self.transform(x)
356    }
357
358    /// Convenience method that always returns dense output for backward compatibility
359    ///
360    /// # Arguments
361    /// * `x` - The input categorical data, shape (n_samples, n_features)
362    ///
363    /// # Returns
364    /// * `Result<Array2<f64>>` - The one-hot encoded data as dense matrix
365    pub fn transform_dense<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
366    where
367        S: Data,
368        S::Elem: Float + NumCast,
369    {
370        Ok(self.transform(x)?.to_dense())
371    }
372
373    /// Convenience method that fits and transforms returning dense output
374    ///
375    /// # Arguments
376    /// * `x` - The input categorical data, shape (n_samples, n_features)
377    ///
378    /// # Returns
379    /// * `Result<Array2<f64>>` - The one-hot encoded data as dense matrix
380    pub fn fit_transform_dense<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
381    where
382        S: Data,
383        S::Elem: Float + NumCast,
384    {
385        Ok(self.fit_transform(x)?.to_dense())
386    }
387
388    /// Returns the categories for each feature
389    ///
390    /// # Returns
391    /// * `Option<&Vec<Vec<u64>>>` - The categories for each feature
392    pub fn categories(&self) -> Option<&Vec<Vec<u64>>> {
393        self.categories_.as_ref()
394    }
395
396    /// Gets the feature names for the transformed output
397    ///
398    /// # Arguments
399    /// * `inputfeatures` - Names of input features
400    ///
401    /// # Returns
402    /// * `Result<Vec<String>>` - Names of output features
403    pub fn get_feature_names(&self, inputfeatures: Option<&[String]>) -> Result<Vec<String>> {
404        if self.categories_.is_none() {
405            return Err(TransformError::TransformationError(
406                "OneHotEncoder has not been fitted".to_string(),
407            ));
408        }
409
410        let categories = self.categories_.as_ref().expect("Operation failed");
411        let mut feature_names = Vec::new();
412
413        for (j, feature_categories) in categories.iter().enumerate() {
414            let feature_name = if let Some(names) = inputfeatures {
415                if j < names.len() {
416                    names[j].clone()
417                } else {
418                    format!("x{j}")
419                }
420            } else {
421                format!("x{j}")
422            };
423
424            let n_cats = feature_categories.len();
425
426            // Determine which categories to include based on drop strategy
427            let (start_idx, n_output_cats) = match &self.drop {
428                Some(strategy) if strategy == "first" => (1, n_cats.saturating_sub(1)),
429                Some(strategy) if strategy == "if_binary" && n_cats == 2 => (0, 1),
430                _ => (0, n_cats),
431            };
432
433            for &category in feature_categories
434                .iter()
435                .skip(start_idx)
436                .take(n_output_cats)
437            {
438                feature_names.push(format!("{feature_name}_cat_{category}"));
439            }
440        }
441
442        Ok(feature_names)
443    }
444}
445
/// OrdinalEncoder for converting categorical features to ordinal integers
///
/// This transformer converts categorical features into ordinal integers,
/// where each category is assigned a unique integer (its index in the
/// sorted list of categories seen during fit).
pub struct OrdinalEncoder {
    /// Categories for each feature (learned during fit; sorted ascending, deduplicated)
    categories_: Option<Vec<Vec<u64>>>,
    /// How to handle unknown categories: "error" or "use_encoded_value"
    handleunknown: String,
    /// Value substituted for unknown categories (Some when handleunknown == "use_encoded_value")
    unknownvalue: Option<f64>,
}
458
459impl OrdinalEncoder {
460    /// Creates a new OrdinalEncoder
461    ///
462    /// # Arguments
463    /// * `handleunknown` - How to handle unknown categories ('error' or 'use_encoded_value')
464    /// * `unknownvalue` - Value to use for unknown categories (when handleunknown='use_encoded_value')
465    ///
466    /// # Returns
467    /// * A new OrdinalEncoder instance
468    pub fn new(handleunknown: &str, unknownvalue: Option<f64>) -> Result<Self> {
469        if handleunknown != "error" && handleunknown != "use_encoded_value" {
470            return Err(TransformError::InvalidInput(
471                "handleunknown must be 'error' or 'use_encoded_value'".to_string(),
472            ));
473        }
474
475        if handleunknown == "use_encoded_value" && unknownvalue.is_none() {
476            return Err(TransformError::InvalidInput(
477                "unknownvalue must be specified when handleunknown='use_encoded_value'".to_string(),
478            ));
479        }
480
481        Ok(OrdinalEncoder {
482            categories_: None,
483            handleunknown: handleunknown.to_string(),
484            unknownvalue,
485        })
486    }
487
488    /// Creates an OrdinalEncoder with default settings
489    pub fn with_defaults() -> Self {
490        Self::new("error", None).expect("Operation failed")
491    }
492
493    /// Fits the OrdinalEncoder to the input data
494    ///
495    /// # Arguments
496    /// * `x` - The input categorical data, shape (n_samples, n_features)
497    ///
498    /// # Returns
499    /// * `Result<()>` - Ok if successful, Err otherwise
500    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
501    where
502        S: Data,
503        S::Elem: Float + NumCast,
504    {
505        let x_u64 = x.mapv(|x| {
506            let val_f64 = NumCast::from(x).unwrap_or(0.0);
507            val_f64 as u64
508        });
509
510        let n_samples = x_u64.shape()[0];
511        let n_features = x_u64.shape()[1];
512
513        if n_samples == 0 || n_features == 0 {
514            return Err(TransformError::InvalidInput("Empty input data".to_string()));
515        }
516
517        let mut categories = Vec::with_capacity(n_features);
518
519        for j in 0..n_features {
520            // Collect unique values for this feature
521            let mut unique_values: Vec<u64> = x_u64.column(j).to_vec();
522            unique_values.sort_unstable();
523            unique_values.dedup();
524
525            categories.push(unique_values);
526        }
527
528        self.categories_ = Some(categories);
529        Ok(())
530    }
531
532    /// Transforms the input data using the fitted OrdinalEncoder
533    ///
534    /// # Arguments
535    /// * `x` - The input categorical data, shape (n_samples, n_features)
536    ///
537    /// # Returns
538    /// * `Result<Array2<f64>>` - The ordinally encoded data
539    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
540    where
541        S: Data,
542        S::Elem: Float + NumCast,
543    {
544        let x_u64 = x.mapv(|x| {
545            let val_f64 = NumCast::from(x).unwrap_or(0.0);
546            val_f64 as u64
547        });
548
549        let n_samples = x_u64.shape()[0];
550        let n_features = x_u64.shape()[1];
551
552        if self.categories_.is_none() {
553            return Err(TransformError::TransformationError(
554                "OrdinalEncoder has not been fitted".to_string(),
555            ));
556        }
557
558        let categories = self.categories_.as_ref().expect("Operation failed");
559
560        if n_features != categories.len() {
561            return Err(TransformError::InvalidInput(format!(
562                "x has {} features, but OrdinalEncoder was fitted with {} features",
563                n_features,
564                categories.len()
565            )));
566        }
567
568        let mut transformed = Array2::zeros((n_samples, n_features));
569
570        // Create mappings from category values to ordinal values
571        let mut category_mappings = Vec::new();
572        for feature_categories in categories {
573            let mut mapping = HashMap::new();
574            for (ordinal, &category) in feature_categories.iter().enumerate() {
575                mapping.insert(category, ordinal as f64);
576            }
577            category_mappings.push(mapping);
578        }
579
580        // Fill the transformed array
581        for i in 0..n_samples {
582            for j in 0..n_features {
583                let value = x_u64[[i, j]];
584
585                if let Some(&ordinal_value) = category_mappings[j].get(&value) {
586                    transformed[[i, j]] = ordinal_value;
587                } else if self.handleunknown == "error" {
588                    return Err(TransformError::InvalidInput(format!(
589                        "Found unknown category {value} in feature {j}"
590                    )));
591                } else {
592                    // handleunknown == "use_encoded_value"
593                    transformed[[i, j]] = self.unknownvalue.expect("Operation failed");
594                }
595            }
596        }
597
598        Ok(transformed)
599    }
600
601    /// Fits the OrdinalEncoder to the input data and transforms it
602    ///
603    /// # Arguments
604    /// * `x` - The input categorical data, shape (n_samples, n_features)
605    ///
606    /// # Returns
607    /// * `Result<Array2<f64>>` - The ordinally encoded data
608    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
609    where
610        S: Data,
611        S::Elem: Float + NumCast,
612    {
613        self.fit(x)?;
614        self.transform(x)
615    }
616
617    /// Returns the categories for each feature
618    ///
619    /// # Returns
620    /// * `Option<&Vec<Vec<u64>>>` - The categories for each feature
621    pub fn categories(&self) -> Option<&Vec<Vec<u64>>> {
622        self.categories_.as_ref()
623    }
624}
625
/// TargetEncoder for supervised categorical encoding
///
/// This encoder transforms categorical features using the target variable values,
/// encoding each category with a statistic (mean, median, etc.) of the target values
/// for that category. This is useful for high-cardinality categorical features.
///
/// # Examples
/// ```
/// use scirs2_core::ndarray::Array2;
/// use scirs2_transform::encoding::TargetEncoder;
///
/// let x = Array2::from_shape_vec((6, 1), vec![0.0, 1.0, 2.0, 0.0, 1.0, 2.0]).expect("Operation failed");
/// let y = vec![1.0, 2.0, 3.0, 1.5, 2.5, 3.5];
///
/// let mut encoder = TargetEncoder::new("mean", 1.0, 0.0).expect("Operation failed");
/// let encoded = encoder.fit_transform(&x, &y).expect("Operation failed");
/// ```
#[derive(Debug, Clone)]
pub struct TargetEncoder {
    /// Encoding strategy ('mean', 'median', 'count', 'sum')
    strategy: String,
    /// Smoothing parameter for regularization (higher = more smoothing toward global mean)
    smoothing: f64,
    /// Global statistic to use for unknown categories.
    /// NOTE(review): 0.0 acts as a sentinel for "unset" in `transform`, which then
    /// falls back to the fitted global mean — confirm a deliberate 0.0 is never needed.
    globalstat: f64,
    /// Mappings from categories to encoded values for each feature
    encodings_: Option<Vec<HashMap<u64, f64>>>,
    /// Whether the encoder has been fitted
    is_fitted: bool,
    /// Global mean of target values (computed during fit; 0.0 before fitting)
    global_mean_: f64,
}
658
659impl TargetEncoder {
660    /// Creates a new TargetEncoder
661    ///
662    /// # Arguments
663    /// * `strategy` - Encoding strategy ('mean', 'median', 'count', 'sum')
664    /// * `smoothing` - Smoothing parameter (0.0 = no smoothing, higher = more smoothing)
665    /// * `globalstat` - Global statistic fallback for unknown categories
666    ///
667    /// # Returns
668    /// * A new TargetEncoder instance
669    pub fn new(_strategy: &str, smoothing: f64, globalstat: f64) -> Result<Self> {
670        if !["mean", "median", "count", "sum"].contains(&_strategy) {
671            return Err(TransformError::InvalidInput(
672                "_strategy must be 'mean', 'median', 'count', or 'sum'".to_string(),
673            ));
674        }
675
676        if smoothing < 0.0 {
677            return Err(TransformError::InvalidInput(
678                "smoothing parameter must be non-negative".to_string(),
679            ));
680        }
681
682        Ok(TargetEncoder {
683            strategy: _strategy.to_string(),
684            smoothing,
685            globalstat,
686            encodings_: None,
687            is_fitted: false,
688            global_mean_: 0.0,
689        })
690    }
691
692    /// Creates a TargetEncoder with mean strategy and default smoothing
693    pub fn with_mean(smoothing: f64) -> Self {
694        TargetEncoder {
695            strategy: "mean".to_string(),
696            smoothing,
697            globalstat: 0.0,
698            encodings_: None,
699            is_fitted: false,
700            global_mean_: 0.0,
701        }
702    }
703
704    /// Creates a TargetEncoder with median strategy
705    pub fn with_median(smoothing: f64) -> Self {
706        TargetEncoder {
707            strategy: "median".to_string(),
708            smoothing,
709            globalstat: 0.0,
710            encodings_: None,
711            is_fitted: false,
712            global_mean_: 0.0,
713        }
714    }
715
716    /// Fits the TargetEncoder to the input data and target values
717    ///
718    /// # Arguments
719    /// * `x` - The input categorical data, shape (n_samples, n_features)
720    /// * `y` - The target values, length n_samples
721    ///
722    /// # Returns
723    /// * `Result<()>` - Ok if successful, Err otherwise
724    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<()>
725    where
726        S: Data,
727        S::Elem: Float + NumCast,
728    {
729        let x_u64 = x.mapv(|x| {
730            let val_f64 = NumCast::from(x).unwrap_or(0.0);
731            val_f64 as u64
732        });
733
734        let n_samples = x_u64.shape()[0];
735        let n_features = x_u64.shape()[1];
736
737        if n_samples == 0 || n_features == 0 {
738            return Err(TransformError::InvalidInput("Empty input data".to_string()));
739        }
740
741        if y.len() != n_samples {
742            return Err(TransformError::InvalidInput(
743                "Number of target values must match number of samples".to_string(),
744            ));
745        }
746
747        // Compute global mean for smoothing
748        self.global_mean_ = y.iter().sum::<f64>() / y.len() as f64;
749
750        let mut encodings = Vec::with_capacity(n_features);
751
752        for j in 0..n_features {
753            // Group target values by category for this feature
754            let mut category_targets: HashMap<u64, Vec<f64>> = HashMap::new();
755
756            for i in 0..n_samples {
757                let category = x_u64[[i, j]];
758                category_targets.entry(category).or_default().push(y[i]);
759            }
760
761            // Compute encoding for each category
762            let mut category_encoding = HashMap::new();
763
764            for (category, targets) in category_targets.iter() {
765                let encoded_value = match self.strategy.as_str() {
766                    "mean" => {
767                        let category_mean = targets.iter().sum::<f64>() / targets.len() as f64;
768                        let count = targets.len() as f64;
769
770                        // Apply smoothing: (count * category_mean + smoothing * global_mean) / (count + smoothing)
771                        if self.smoothing > 0.0 {
772                            (count * category_mean + self.smoothing * self.global_mean_)
773                                / (count + self.smoothing)
774                        } else {
775                            category_mean
776                        }
777                    }
778                    "median" => {
779                        let mut sorted_targets = targets.clone();
780                        sorted_targets.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
781
782                        let median = if sorted_targets.len() % 2 == 0 {
783                            let mid = sorted_targets.len() / 2;
784                            (sorted_targets[mid - 1] + sorted_targets[mid]) / 2.0
785                        } else {
786                            sorted_targets[sorted_targets.len() / 2]
787                        };
788
789                        // Apply smoothing toward global mean
790                        if self.smoothing > 0.0 {
791                            let count = targets.len() as f64;
792                            (count * median + self.smoothing * self.global_mean_)
793                                / (count + self.smoothing)
794                        } else {
795                            median
796                        }
797                    }
798                    "count" => targets.len() as f64,
799                    "sum" => targets.iter().sum::<f64>(),
800                    _ => unreachable!(),
801                };
802
803                category_encoding.insert(*category, encoded_value);
804            }
805
806            encodings.push(category_encoding);
807        }
808
809        self.encodings_ = Some(encodings);
810        self.is_fitted = true;
811        Ok(())
812    }
813
814    /// Transforms the input data using the fitted TargetEncoder
815    ///
816    /// # Arguments
817    /// * `x` - The input categorical data, shape (n_samples, n_features)
818    ///
819    /// # Returns
820    /// * `Result<Array2<f64>>` - The target-encoded data
821    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
822    where
823        S: Data,
824        S::Elem: Float + NumCast,
825    {
826        if !self.is_fitted {
827            return Err(TransformError::TransformationError(
828                "TargetEncoder has not been fitted".to_string(),
829            ));
830        }
831
832        let x_u64 = x.mapv(|x| {
833            let val_f64 = NumCast::from(x).unwrap_or(0.0);
834            val_f64 as u64
835        });
836
837        let n_samples = x_u64.shape()[0];
838        let n_features = x_u64.shape()[1];
839
840        let encodings = self.encodings_.as_ref().expect("Operation failed");
841
842        if n_features != encodings.len() {
843            return Err(TransformError::InvalidInput(format!(
844                "x has {} features, but TargetEncoder was fitted with {} features",
845                n_features,
846                encodings.len()
847            )));
848        }
849
850        let mut transformed = Array2::zeros((n_samples, n_features));
851
852        for i in 0..n_samples {
853            for j in 0..n_features {
854                let category = x_u64[[i, j]];
855
856                if let Some(&encoded_value) = encodings[j].get(&category) {
857                    transformed[[i, j]] = encoded_value;
858                } else {
859                    // Use global statistic for unknown categories
860                    transformed[[i, j]] = if self.globalstat != 0.0 {
861                        self.globalstat
862                    } else {
863                        self.global_mean_
864                    };
865                }
866            }
867        }
868
869        Ok(transformed)
870    }
871
872    /// Fits the TargetEncoder and transforms the data in one step
873    ///
874    /// # Arguments
875    /// * `x` - The input categorical data, shape (n_samples, n_features)
876    /// * `y` - The target values, length n_samples
877    ///
878    /// # Returns
879    /// * `Result<Array2<f64>>` - The target-encoded data
880    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<Array2<f64>>
881    where
882        S: Data,
883        S::Elem: Float + NumCast,
884    {
885        self.fit(x, y)?;
886        self.transform(x)
887    }
888
    /// Returns the learned encodings for each feature
    ///
    /// # Returns
    /// * `Option<&Vec<HashMap<u64, f64>>>` - One category→value map per
    ///   feature, or `None` if the encoder has not been fitted yet
    pub fn encodings(&self) -> Option<&Vec<HashMap<u64, f64>>> {
        self.encodings_.as_ref()
    }

    /// Returns whether the encoder has been fitted
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }

    /// Returns the global mean computed during fitting
    ///
    /// The mean of the target values seen by the most recent fit; it is used
    /// for smoothing and as the fallback encoding for unknown categories.
    pub fn global_mean(&self) -> f64 {
        self.global_mean_
    }
906
907    /// Applies cross-validation target encoding to prevent overfitting
908    ///
909    /// This method performs k-fold cross-validation to compute target encodings,
910    /// which helps prevent overfitting when the same data is used for both
911    /// fitting and transforming.
912    ///
913    /// # Arguments
914    /// * `x` - The input categorical data, shape (n_samples, n_features)
915    /// * `y` - The target values, length n_samples
916    /// * `cv_folds` - Number of cross-validation folds (default: 5)
917    ///
918    /// # Returns
919    /// * `Result<Array2<f64>>` - The cross-validated target-encoded data
920    pub fn fit_transform_cv<S>(
921        &mut self,
922        x: &ArrayBase<S, Ix2>,
923        y: &[f64],
924        cv_folds: usize,
925    ) -> Result<Array2<f64>>
926    where
927        S: Data,
928        S::Elem: Float + NumCast,
929    {
930        let x_u64 = x.mapv(|x| {
931            let val_f64 = NumCast::from(x).unwrap_or(0.0);
932            val_f64 as u64
933        });
934
935        let n_samples = x_u64.shape()[0];
936        let n_features = x_u64.shape()[1];
937
938        if n_samples == 0 || n_features == 0 {
939            return Err(TransformError::InvalidInput("Empty input data".to_string()));
940        }
941
942        if y.len() != n_samples {
943            return Err(TransformError::InvalidInput(
944                "Number of target values must match number of samples".to_string(),
945            ));
946        }
947
948        if cv_folds < 2 {
949            return Err(TransformError::InvalidInput(
950                "cv_folds must be at least 2".to_string(),
951            ));
952        }
953
954        let mut transformed = Array2::zeros((n_samples, n_features));
955
956        // Compute global mean
957        self.global_mean_ = y.iter().sum::<f64>() / y.len() as f64;
958
959        // Create fold indices
960        let fold_size = n_samples / cv_folds;
961        let mut fold_indices = Vec::new();
962        for fold in 0..cv_folds {
963            let start = fold * fold_size;
964            let end = if fold == cv_folds - 1 {
965                n_samples
966            } else {
967                (fold + 1) * fold_size
968            };
969            fold_indices.push((start, end));
970        }
971
972        // For each fold, train on other _folds and predict on this fold
973        for fold in 0..cv_folds {
974            let (val_start, val_end) = fold_indices[fold];
975
976            // Collect training data (all _folds except current)
977            let mut train_indices = Vec::new();
978            for (other_fold, &(start, end)) in fold_indices.iter().enumerate().take(cv_folds) {
979                if other_fold != fold {
980                    train_indices.extend(start..end);
981                }
982            }
983
984            // For each feature, compute encodings on training data
985            for j in 0..n_features {
986                let mut category_targets: HashMap<u64, Vec<f64>> = HashMap::new();
987
988                // Collect target values by category for training data
989                for &train_idx in &train_indices {
990                    let category = x_u64[[train_idx, j]];
991                    category_targets
992                        .entry(category)
993                        .or_default()
994                        .push(y[train_idx]);
995                }
996
997                // Compute encodings for this fold
998                let mut category_encoding = HashMap::new();
999                for (category, targets) in category_targets.iter() {
1000                    let encoded_value = match self.strategy.as_str() {
1001                        "mean" => {
1002                            let category_mean = targets.iter().sum::<f64>() / targets.len() as f64;
1003                            let count = targets.len() as f64;
1004
1005                            if self.smoothing > 0.0 {
1006                                (count * category_mean + self.smoothing * self.global_mean_)
1007                                    / (count + self.smoothing)
1008                            } else {
1009                                category_mean
1010                            }
1011                        }
1012                        "median" => {
1013                            let mut sorted_targets = targets.clone();
1014                            sorted_targets
1015                                .sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
1016
1017                            let median = if sorted_targets.len() % 2 == 0 {
1018                                let mid = sorted_targets.len() / 2;
1019                                (sorted_targets[mid - 1] + sorted_targets[mid]) / 2.0
1020                            } else {
1021                                sorted_targets[sorted_targets.len() / 2]
1022                            };
1023
1024                            if self.smoothing > 0.0 {
1025                                let count = targets.len() as f64;
1026                                (count * median + self.smoothing * self.global_mean_)
1027                                    / (count + self.smoothing)
1028                            } else {
1029                                median
1030                            }
1031                        }
1032                        "count" => targets.len() as f64,
1033                        "sum" => targets.iter().sum::<f64>(),
1034                        _ => unreachable!(),
1035                    };
1036
1037                    category_encoding.insert(*category, encoded_value);
1038                }
1039
1040                // Apply encodings to validation fold
1041                for val_idx in val_start..val_end {
1042                    let category = x_u64[[val_idx, j]];
1043
1044                    if let Some(&encoded_value) = category_encoding.get(&category) {
1045                        transformed[[val_idx, j]] = encoded_value;
1046                    } else {
1047                        // Use global mean for unknown categories
1048                        transformed[[val_idx, j]] = self.global_mean_;
1049                    }
1050                }
1051            }
1052        }
1053
1054        // Now fit on the full data for future transforms
1055        self.fit(x, y)?;
1056
1057        Ok(transformed)
1058    }
1059}
1060
/// BinaryEncoder for converting categorical features to binary representations
///
/// This transformer converts categorical features into binary representations,
/// where each category is encoded as a unique binary number. This is more
/// memory-efficient than one-hot encoding for high-cardinality categorical features.
///
/// For n unique categories, ceil(log2(n)) binary features are created.
#[derive(Debug, Clone)]
pub struct BinaryEncoder {
    /// Mappings from categories to binary codes for each feature
    /// (populated by `fit`; `None` before fitting)
    categories_: Option<Vec<HashMap<u64, Vec<u8>>>>,
    /// Number of binary features per original feature (populated by `fit`)
    n_binary_features_: Option<Vec<usize>>,
    /// How to handle unknown categories: "error" or "ignore"
    handleunknown: String,
    /// Whether the encoder has been fitted
    is_fitted: bool,
}
1079
1080impl BinaryEncoder {
1081    /// Creates a new BinaryEncoder
1082    ///
1083    /// # Arguments
1084    /// * `handleunknown` - How to handle unknown categories ('error' or 'ignore')
1085    ///   - 'error': Raise an error if unknown categories are encountered
1086    ///   - 'ignore': Encode unknown categories as all zeros
1087    ///
1088    /// # Returns
1089    /// * `Result<BinaryEncoder>` - The new encoder instance
1090    pub fn new(handleunknown: &str) -> Result<Self> {
1091        if handleunknown != "error" && handleunknown != "ignore" {
1092            return Err(TransformError::InvalidInput(
1093                "handleunknown must be 'error' or 'ignore'".to_string(),
1094            ));
1095        }
1096
1097        Ok(BinaryEncoder {
1098            categories_: None,
1099            n_binary_features_: None,
1100            handleunknown: handleunknown.to_string(),
1101            is_fitted: false,
1102        })
1103    }
1104
1105    /// Creates a BinaryEncoder with default settings (handleunknown='error')
1106    pub fn with_defaults() -> Self {
1107        Self::new("error").expect("Operation failed")
1108    }
1109
1110    /// Fits the BinaryEncoder to the input data
1111    ///
1112    /// # Arguments
1113    /// * `x` - The input categorical data, shape (n_samples, n_features)
1114    ///
1115    /// # Returns
1116    /// * `Result<()>` - Ok if successful, Err otherwise
1117    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
1118    where
1119        S: Data,
1120        S::Elem: Float + NumCast,
1121    {
1122        let x_u64 = x.mapv(|x| {
1123            let val_f64 = NumCast::from(x).unwrap_or(0.0);
1124            val_f64 as u64
1125        });
1126
1127        let n_samples = x_u64.shape()[0];
1128        let n_features = x_u64.shape()[1];
1129
1130        if n_samples == 0 || n_features == 0 {
1131            return Err(TransformError::InvalidInput("Empty input data".to_string()));
1132        }
1133
1134        let mut categories = Vec::with_capacity(n_features);
1135        let mut n_binary_features = Vec::with_capacity(n_features);
1136
1137        for j in 0..n_features {
1138            // Collect unique categories for this feature
1139            let mut unique_categories: Vec<u64> = x_u64.column(j).to_vec();
1140            unique_categories.sort_unstable();
1141            unique_categories.dedup();
1142
1143            if unique_categories.is_empty() {
1144                return Err(TransformError::InvalidInput(
1145                    "Feature has no valid categories".to_string(),
1146                ));
1147            }
1148
1149            // Calculate number of binary features needed
1150            let n_cats = unique_categories.len();
1151            let nbits = if n_cats <= 1 {
1152                1
1153            } else {
1154                (n_cats as f64).log2().ceil() as usize
1155            };
1156
1157            // Create binary mappings
1158            let mut category_map = HashMap::new();
1159            for (idx, &category) in unique_categories.iter().enumerate() {
1160                let binary_code = Self::int_to_binary(idx, nbits);
1161                category_map.insert(category, binary_code);
1162            }
1163
1164            categories.push(category_map);
1165            n_binary_features.push(nbits);
1166        }
1167
1168        self.categories_ = Some(categories);
1169        self.n_binary_features_ = Some(n_binary_features);
1170        self.is_fitted = true;
1171
1172        Ok(())
1173    }
1174
1175    /// Transforms the input data using the fitted encoder
1176    ///
1177    /// # Arguments
1178    /// * `x` - The input categorical data to transform
1179    ///
1180    /// # Returns
1181    /// * `Result<Array2<f64>>` - The binary-encoded data
1182    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1183    where
1184        S: Data,
1185        S::Elem: Float + NumCast,
1186    {
1187        if !self.is_fitted {
1188            return Err(TransformError::InvalidInput(
1189                "Encoder has not been fitted yet".to_string(),
1190            ));
1191        }
1192
1193        let categories = self.categories_.as_ref().expect("Operation failed");
1194        let n_binary_features = self.n_binary_features_.as_ref().expect("Operation failed");
1195
1196        let x_u64 = x.mapv(|x| {
1197            let val_f64 = NumCast::from(x).unwrap_or(0.0);
1198            val_f64 as u64
1199        });
1200
1201        let n_samples = x_u64.shape()[0];
1202        let n_features = x_u64.shape()[1];
1203
1204        if n_features != categories.len() {
1205            return Err(TransformError::InvalidInput(format!(
1206                "Number of features ({}) does not match fitted features ({})",
1207                n_features,
1208                categories.len()
1209            )));
1210        }
1211
1212        // Calculate total number of output features
1213        let total_binary_features: usize = n_binary_features.iter().sum();
1214        let mut result = Array2::<f64>::zeros((n_samples, total_binary_features));
1215
1216        let mut output_col = 0;
1217        for j in 0..n_features {
1218            let category_map = &categories[j];
1219            let nbits = n_binary_features[j];
1220
1221            for i in 0..n_samples {
1222                let category = x_u64[[i, j]];
1223
1224                if let Some(binary_code) = category_map.get(&category) {
1225                    // Known category: use binary code
1226                    for (bit_idx, &bit_val) in binary_code.iter().enumerate() {
1227                        result[[i, output_col + bit_idx]] = bit_val as f64;
1228                    }
1229                } else {
1230                    // Unknown category
1231                    match self.handleunknown.as_str() {
1232                        "error" => {
1233                            return Err(TransformError::InvalidInput(format!(
1234                                "Unknown category {category} in feature {j}"
1235                            )));
1236                        }
1237                        "ignore" => {
1238                            // Set all bits to zero (already initialized)
1239                        }
1240                        _ => unreachable!(),
1241                    }
1242                }
1243            }
1244
1245            output_col += nbits;
1246        }
1247
1248        Ok(result)
1249    }
1250
1251    /// Fits the encoder and transforms the data in one step
1252    ///
1253    /// # Arguments
1254    /// * `x` - The input categorical data
1255    ///
1256    /// # Returns
1257    /// * `Result<Array2<f64>>` - The binary-encoded data
1258    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1259    where
1260        S: Data,
1261        S::Elem: Float + NumCast,
1262    {
1263        self.fit(x)?;
1264        self.transform(x)
1265    }
1266
1267    /// Returns whether the encoder has been fitted
1268    pub fn is_fitted(&self) -> bool {
1269        self.is_fitted
1270    }
1271
1272    /// Returns the category mappings if fitted
1273    pub fn categories(&self) -> Option<&Vec<HashMap<u64, Vec<u8>>>> {
1274        self.categories_.as_ref()
1275    }
1276
1277    /// Returns the number of binary features per original feature
1278    pub fn n_binary_features(&self) -> Option<&Vec<usize>> {
1279        self.n_binary_features_.as_ref()
1280    }
1281
1282    /// Returns the total number of output features
1283    pub fn n_output_features(&self) -> Option<usize> {
1284        self.n_binary_features_.as_ref().map(|v| v.iter().sum())
1285    }
1286
1287    /// Converts an integer to binary representation
1288    fn int_to_binary(_value: usize, nbits: usize) -> Vec<u8> {
1289        let mut binary = Vec::with_capacity(nbits);
1290        let mut val = _value;
1291
1292        for _ in 0..nbits {
1293            binary.push((val & 1) as u8);
1294            val >>= 1;
1295        }
1296
1297        binary.reverse(); // Most significant bit first
1298        binary
1299    }
1300}
1301
/// FrequencyEncoder for converting categorical features to frequency counts
///
/// This transformer converts categorical features into their frequency of occurrence
/// in the training data. High-frequency categories get higher values, which can be
/// useful for models that can leverage frequency information.
#[derive(Debug, Clone)]
pub struct FrequencyEncoder {
    /// Frequency mappings for each feature (populated by `fit`; `None` before fitting)
    frequency_maps_: Option<Vec<HashMap<u64, f64>>>,
    /// Whether to normalize frequencies to [0, 1] (counts divided by n_samples)
    normalize: bool,
    /// How to handle unknown categories: "error", "ignore", or "use_encoded_value"
    handleunknown: String,
    /// Value to use for unknown categories (when handleunknown="use_encoded_value")
    unknownvalue: f64,
    /// Whether the encoder has been fitted
    is_fitted: bool,
}
1320
1321impl FrequencyEncoder {
1322    /// Creates a new FrequencyEncoder
1323    ///
1324    /// # Arguments
1325    /// * `normalize` - Whether to normalize frequencies to [0, 1] range
1326    /// * `handleunknown` - How to handle unknown categories ('error', 'ignore', or 'use_encoded_value')
1327    /// * `unknownvalue` - Value to use for unknown categories (when handleunknown="use_encoded_value")
1328    ///
1329    /// # Returns
1330    /// * `Result<FrequencyEncoder>` - The new encoder instance
1331    pub fn new(normalize: bool, handleunknown: &str, unknownvalue: f64) -> Result<Self> {
1332        if !["error", "ignore", "use_encoded_value"].contains(&handleunknown) {
1333            return Err(TransformError::InvalidInput(
1334                "handleunknown must be 'error', 'ignore', or 'use_encoded_value'".to_string(),
1335            ));
1336        }
1337
1338        Ok(FrequencyEncoder {
1339            frequency_maps_: None,
1340            normalize,
1341            handleunknown: handleunknown.to_string(),
1342            unknownvalue,
1343            is_fitted: false,
1344        })
1345    }
1346
1347    /// Creates a FrequencyEncoder with default settings
1348    pub fn with_defaults() -> Self {
1349        Self::new(false, "error", 0.0).expect("Operation failed")
1350    }
1351
1352    /// Creates a FrequencyEncoder with normalized frequencies
1353    pub fn with_normalization() -> Self {
1354        Self::new(true, "error", 0.0).expect("Operation failed")
1355    }
1356
1357    /// Fits the FrequencyEncoder to the input data
1358    ///
1359    /// # Arguments
1360    /// * `x` - The input categorical data, shape (n_samples, n_features)
1361    ///
1362    /// # Returns
1363    /// * `Result<()>` - Ok if successful, Err otherwise
1364    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
1365    where
1366        S: Data,
1367        S::Elem: Float + NumCast,
1368    {
1369        let x_u64 = x.mapv(|x| {
1370            let val_f64 = NumCast::from(x).unwrap_or(0.0);
1371            val_f64 as u64
1372        });
1373
1374        let n_samples = x_u64.shape()[0];
1375        let n_features = x_u64.shape()[1];
1376
1377        if n_samples == 0 || n_features == 0 {
1378            return Err(TransformError::InvalidInput("Empty input data".to_string()));
1379        }
1380
1381        let mut frequency_maps = Vec::with_capacity(n_features);
1382
1383        for j in 0..n_features {
1384            // Count frequency of each category
1385            let mut category_counts: HashMap<u64, usize> = HashMap::new();
1386            for i in 0..n_samples {
1387                let category = x_u64[[i, j]];
1388                *category_counts.entry(category).or_insert(0) += 1;
1389            }
1390
1391            // Convert counts to frequencies
1392            let mut frequency_map = HashMap::new();
1393            for (category, count) in category_counts {
1394                let frequency = if self.normalize {
1395                    count as f64 / n_samples as f64
1396                } else {
1397                    count as f64
1398                };
1399                frequency_map.insert(category, frequency);
1400            }
1401
1402            frequency_maps.push(frequency_map);
1403        }
1404
1405        self.frequency_maps_ = Some(frequency_maps);
1406        self.is_fitted = true;
1407        Ok(())
1408    }
1409
1410    /// Transforms the input data using the fitted FrequencyEncoder
1411    ///
1412    /// # Arguments
1413    /// * `x` - The input categorical data, shape (n_samples, n_features)
1414    ///
1415    /// # Returns
1416    /// * `Result<Array2<f64>>` - The frequency-encoded data
1417    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1418    where
1419        S: Data,
1420        S::Elem: Float + NumCast,
1421    {
1422        if !self.is_fitted {
1423            return Err(TransformError::TransformationError(
1424                "FrequencyEncoder has not been fitted".to_string(),
1425            ));
1426        }
1427
1428        let frequency_maps = self.frequency_maps_.as_ref().expect("Operation failed");
1429
1430        let x_u64 = x.mapv(|x| {
1431            let val_f64 = NumCast::from(x).unwrap_or(0.0);
1432            val_f64 as u64
1433        });
1434
1435        let n_samples = x_u64.shape()[0];
1436        let n_features = x_u64.shape()[1];
1437
1438        if n_features != frequency_maps.len() {
1439            return Err(TransformError::InvalidInput(format!(
1440                "x has {} features, but FrequencyEncoder was fitted with {} features",
1441                n_features,
1442                frequency_maps.len()
1443            )));
1444        }
1445
1446        let mut transformed = Array2::zeros((n_samples, n_features));
1447
1448        for i in 0..n_samples {
1449            for j in 0..n_features {
1450                let category = x_u64[[i, j]];
1451
1452                if let Some(&frequency) = frequency_maps[j].get(&category) {
1453                    transformed[[i, j]] = frequency;
1454                } else {
1455                    // Handle unknown category
1456                    match self.handleunknown.as_str() {
1457                        "error" => {
1458                            return Err(TransformError::InvalidInput(format!(
1459                                "Unknown category {category} in feature {j}"
1460                            )));
1461                        }
1462                        "ignore" => {
1463                            transformed[[i, j]] = 0.0;
1464                        }
1465                        "use_encoded_value" => {
1466                            transformed[[i, j]] = self.unknownvalue;
1467                        }
1468                        _ => unreachable!(),
1469                    }
1470                }
1471            }
1472        }
1473
1474        Ok(transformed)
1475    }
1476
1477    /// Fits the encoder and transforms the data in one step
1478    ///
1479    /// # Arguments
1480    /// * `x` - The input categorical data
1481    ///
1482    /// # Returns
1483    /// * `Result<Array2<f64>>` - The frequency-encoded data
1484    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1485    where
1486        S: Data,
1487        S::Elem: Float + NumCast,
1488    {
1489        self.fit(x)?;
1490        self.transform(x)
1491    }
1492
1493    /// Returns whether the encoder has been fitted
1494    pub fn is_fitted(&self) -> bool {
1495        self.is_fitted
1496    }
1497
1498    /// Returns the learned frequency mappings if fitted
1499    pub fn frequency_maps(&self) -> Option<&Vec<HashMap<u64, f64>>> {
1500        self.frequency_maps_.as_ref()
1501    }
1502}
1503
/// Weight of Evidence (WOE) Encoder for converting categorical features using target information
///
/// WOE encoding transforms categorical features based on the relationship between
/// each category and a binary target variable. It's particularly useful for credit
/// scoring and other binary classification tasks.
///
/// WOE = ln(P(category|target=1) / P(category|target=0)),
/// i.e. the log-ratio of the category's share of events to its share of non-events.
#[derive(Debug, Clone)]
pub struct WOEEncoder {
    /// WOE mappings for each feature (populated by `fit`; `None` before fitting)
    woe_maps_: Option<Vec<HashMap<u64, f64>>>,
    /// Information Value (IV) for each feature (populated by `fit`)
    information_values_: Option<Vec<f64>>,
    /// Regularization parameter to handle categories with zero events/non-events
    regularization: f64,
    /// How to handle unknown categories: "error", "global_woe", or "use_encoded_value"
    handleunknown: String,
    /// Value to use for unknown categories (when handleunknown="use_encoded_value")
    unknownvalue: f64,
    /// Global WOE value for unknown categories (computed as overall log-odds)
    global_woe_: f64,
    /// Whether the encoder has been fitted
    is_fitted: bool,
}
1528
1529impl WOEEncoder {
1530    /// Creates a new WOEEncoder
1531    ///
1532    /// # Arguments
1533    /// * `regularization` - Small value added to prevent division by zero (default: 0.5)
1534    /// * `handleunknown` - How to handle unknown categories ('error', 'global_woe', or 'use_encoded_value')
1535    /// * `unknownvalue` - Value to use for unknown categories (when handleunknown="use_encoded_value")
1536    ///
1537    /// # Returns
1538    /// * `Result<WOEEncoder>` - The new encoder instance
1539    pub fn new(regularization: f64, handleunknown: &str, unknownvalue: f64) -> Result<Self> {
1540        if regularization < 0.0 {
1541            return Err(TransformError::InvalidInput(
1542                "regularization must be non-negative".to_string(),
1543            ));
1544        }
1545
1546        if !["error", "global_woe", "use_encoded_value"].contains(&handleunknown) {
1547            return Err(TransformError::InvalidInput(
1548                "handleunknown must be 'error', 'global_woe', or 'use_encoded_value'".to_string(),
1549            ));
1550        }
1551
1552        Ok(WOEEncoder {
1553            woe_maps_: None,
1554            information_values_: None,
1555            regularization,
1556            handleunknown: handleunknown.to_string(),
1557            unknownvalue,
1558            global_woe_: 0.0,
1559            is_fitted: false,
1560        })
1561    }
1562
1563    /// Creates a WOEEncoder with default settings
1564    pub fn with_defaults() -> Self {
1565        Self::new(0.5, "global_woe", 0.0).expect("Operation failed")
1566    }
1567
1568    /// Creates a WOEEncoder with custom regularization
1569    pub fn with_regularization(regularization: f64) -> Result<Self> {
1570        Self::new(regularization, "global_woe", 0.0)
1571    }
1572
    /// Fits the WOEEncoder to the input data
    ///
    /// Learns, for every feature, a Weight-of-Evidence value per category:
    /// the log-ratio of the category's (regularized) share of events to its
    /// share of non-events. Also accumulates each feature's Information
    /// Value (IV), the sum of (event_rate - non_event_rate) * WOE over its
    /// categories.
    ///
    /// # Arguments
    /// * `x` - The input categorical data, shape (n_samples, n_features)
    /// * `y` - The binary target values (0 or 1), length n_samples
    ///
    /// # Returns
    /// * `Result<()>` - Ok if successful, Err otherwise
    ///
    /// # Errors
    /// Returns an error for empty input, a `y` length that does not match the
    /// number of samples, non-binary targets, or a target vector containing
    /// only one class.
    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<()>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        // Truncate input values to integer category ids; values that cannot
        // be cast are mapped to category 0.
        let x_u64 = x.mapv(|x| {
            let val_f64 = NumCast::from(x).unwrap_or(0.0);
            val_f64 as u64
        });

        let n_samples = x_u64.shape()[0];
        let n_features = x_u64.shape()[1];

        if n_samples == 0 || n_features == 0 {
            return Err(TransformError::InvalidInput("Empty input data".to_string()));
        }

        if y.len() != n_samples {
            return Err(TransformError::InvalidInput(
                "Number of target values must match number of samples".to_string(),
            ));
        }

        // Validate that target is binary (strictly 0.0 or 1.0)
        for &target in y {
            if target != 0.0 && target != 1.0 {
                return Err(TransformError::InvalidInput(
                    "Target values must be binary (0 or 1)".to_string(),
                ));
            }
        }

        // Calculate global statistics: total events (target=1) and non-events
        let total_events: f64 = y.iter().sum();
        let total_non_events = n_samples as f64 - total_events;

        // WOE is undefined when either class is absent from the target
        if total_events == 0.0 || total_non_events == 0.0 {
            return Err(TransformError::InvalidInput(
                "Target must contain both 0 and 1 values".to_string(),
            ));
        }

        // Global WOE (overall log-odds); used as the unknown-category
        // fallback when handleunknown="global_woe"
        self.global_woe_ = (total_events / total_non_events).ln();

        let mut woe_maps = Vec::with_capacity(n_features);
        let mut information_values = Vec::with_capacity(n_features);

        for j in 0..n_features {
            // Collect target values by category
            let mut category_stats: HashMap<u64, (f64, f64)> = HashMap::new(); // (events, non_events)

            for i in 0..n_samples {
                let category = x_u64[[i, j]];
                let target = y[i];

                let (events, non_events) = category_stats.entry(category).or_insert((0.0, 0.0));
                if target == 1.0 {
                    *events += 1.0;
                } else {
                    *non_events += 1.0;
                }
            }

            // Calculate WOE and IV for each category
            let mut woe_map = HashMap::new();
            let mut feature_iv = 0.0;

            for (category, (events, non_events)) in category_stats.iter() {
                // Add regularization to handle zero counts; the totals are
                // inflated by regularization * n_categories so that the
                // regularized per-category shares still sum to 1
                let reg_events = events + self.regularization;
                let reg_non_events = non_events + self.regularization;
                let reg_total_events =
                    total_events + self.regularization * category_stats.len() as f64;
                let reg_total_non_events =
                    total_non_events + self.regularization * category_stats.len() as f64;

                // Calculate distribution percentages: the category's share of
                // all events and of all non-events
                let event_rate = reg_events / reg_total_events;
                let non_event_rate = reg_non_events / reg_total_non_events;

                // Calculate WOE = ln(event_rate / non_event_rate)
                let woe = (event_rate / non_event_rate).ln();
                woe_map.insert(*category, woe);

                // Calculate Information Value contribution for this category
                let iv_contribution = (event_rate - non_event_rate) * woe;
                feature_iv += iv_contribution;
            }

            woe_maps.push(woe_map);
            information_values.push(feature_iv);
        }

        self.woe_maps_ = Some(woe_maps);
        self.information_values_ = Some(information_values);
        self.is_fitted = true;
        Ok(())
    }
1680
1681    /// Transforms the input data using the fitted WOEEncoder
1682    ///
1683    /// # Arguments
1684    /// * `x` - The input categorical data, shape (n_samples, n_features)
1685    ///
1686    /// # Returns
1687    /// * `Result<Array2<f64>>` - The WOE-encoded data
1688    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1689    where
1690        S: Data,
1691        S::Elem: Float + NumCast,
1692    {
1693        if !self.is_fitted {
1694            return Err(TransformError::TransformationError(
1695                "WOEEncoder has not been fitted".to_string(),
1696            ));
1697        }
1698
1699        let woe_maps = self.woe_maps_.as_ref().expect("Operation failed");
1700
1701        let x_u64 = x.mapv(|x| {
1702            let val_f64 = NumCast::from(x).unwrap_or(0.0);
1703            val_f64 as u64
1704        });
1705
1706        let n_samples = x_u64.shape()[0];
1707        let n_features = x_u64.shape()[1];
1708
1709        if n_features != woe_maps.len() {
1710            return Err(TransformError::InvalidInput(format!(
1711                "x has {} features, but WOEEncoder was fitted with {} features",
1712                n_features,
1713                woe_maps.len()
1714            )));
1715        }
1716
1717        let mut transformed = Array2::zeros((n_samples, n_features));
1718
1719        for i in 0..n_samples {
1720            for j in 0..n_features {
1721                let category = x_u64[[i, j]];
1722
1723                if let Some(&woe_value) = woe_maps[j].get(&category) {
1724                    transformed[[i, j]] = woe_value;
1725                } else {
1726                    // Handle unknown category
1727                    match self.handleunknown.as_str() {
1728                        "error" => {
1729                            return Err(TransformError::InvalidInput(format!(
1730                                "Unknown category {category} in feature {j}"
1731                            )));
1732                        }
1733                        "global_woe" => {
1734                            transformed[[i, j]] = self.global_woe_;
1735                        }
1736                        "use_encoded_value" => {
1737                            transformed[[i, j]] = self.unknownvalue;
1738                        }
1739                        _ => unreachable!(),
1740                    }
1741                }
1742            }
1743        }
1744
1745        Ok(transformed)
1746    }
1747
1748    /// Fits the encoder and transforms the data in one step
1749    ///
1750    /// # Arguments
1751    /// * `x` - The input categorical data
1752    /// * `y` - The binary target values
1753    ///
1754    /// # Returns
1755    /// * `Result<Array2<f64>>` - The WOE-encoded data
1756    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>, y: &[f64]) -> Result<Array2<f64>>
1757    where
1758        S: Data,
1759        S::Elem: Float + NumCast,
1760    {
1761        self.fit(x, y)?;
1762        self.transform(x)
1763    }
1764
1765    /// Returns whether the encoder has been fitted
1766    pub fn is_fitted(&self) -> bool {
1767        self.is_fitted
1768    }
1769
1770    /// Returns the learned WOE mappings if fitted
1771    pub fn woe_maps(&self) -> Option<&Vec<HashMap<u64, f64>>> {
1772        self.woe_maps_.as_ref()
1773    }
1774
1775    /// Returns the Information Values for each feature if fitted
1776    ///
1777    /// Information Value interpretation:
1778    /// - < 0.02: Not useful for prediction
1779    /// - 0.02 - 0.1: Weak predictive power
1780    /// - 0.1 - 0.3: Medium predictive power  
1781    /// - 0.3 - 0.5: Strong predictive power
1782    /// - > 0.5: Suspicious, too good to be true
1783    pub fn information_values(&self) -> Option<&Vec<f64>> {
1784        self.information_values_.as_ref()
1785    }
1786
1787    /// Returns the global WOE value (overall log-odds)
1788    pub fn global_woe(&self) -> f64 {
1789        self.global_woe_
1790    }
1791
1792    /// Returns features ranked by Information Value (descending order)
1793    ///
1794    /// # Returns
1795    /// * `Option<Vec<(usize, f64)>>` - Vector of (feature_index, information_value) pairs
1796    pub fn feature_importance_ranking(&self) -> Option<Vec<(usize, f64)>> {
1797        self.information_values_.as_ref().map(|ivs| {
1798            let mut ranking: Vec<(usize, f64)> =
1799                ivs.iter().enumerate().map(|(idx, &iv)| (idx, iv)).collect();
1800            ranking.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
1801            ranking
1802        })
1803    }
1804}
1805
1806#[cfg(test)]
1807#[path = "encoding_tests.rs"]
1808mod tests;