sklears_preprocessing/
encoding.rs

1//! Data encoding and categorical feature transformation utilities
2//!
3//! This module provides comprehensive data encoding implementations including
4//! label encoding, one-hot encoding, ordinal encoding, binary encoding, hash encoding,
5//! frequency encoding, target encoding, feature hashing, categorical transformations,
6//! cardinality reduction, embedding-based encoding, statistical encoding, smoothing techniques,
7//! regularization methods, cross-validation encoding, time-aware encoding, and
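//! high-performance categorical feature processing pipelines. The algorithms are intended to
//! live in focused submodules for better maintainability; those submodules are not implemented
//! yet (see the FIXME notes below), so the currently available encoders are defined directly
//! in this file. The code is meant to comply with the SciRS2 Policy.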
10
11// FIXME: These modules are not implemented yet - commenting out to allow compilation
12// // Core encoding types and base structures
13// mod encoding_core;
14// pub use encoding_core::{
15//     EncodingProcessor, EncodingConfig, EncodingValidator, EncodingEstimator,
16//     EncodingTransformer, EncodingAnalyzer, CategoricalProcessor, FeatureEncoder
17// };
18
19// // Label encoding and categorical to numerical transformation
20// mod label_encoding;
21// pub use label_encoding::{
22//     LabelEncoder, LabelEncodingConfig, LabelEncodingValidator, CategoricalToNumerical,
23//     StringEncoder, ClassEncoder, IndexMapping, LabelTransformer,
24//     InverseLabelEncoder, LabelMappingAnalyzer, MultiLabelEncoder
25// };
26
27// // One-hot encoding and sparse representation
28// mod onehot_encoding;
29// pub use onehot_encoding::{
30//     OneHotEncoder, OneHotConfig, OneHotValidator, SparseOneHot,
31//     DenseOneHot, BinaryIndicator, CategoricalExpansion, OneHotTransformer,
32//     SparseMatrixEncoder, OneHotOptimizer, MemoryEfficientOneHot
33// };
34
35// // Ordinal encoding and rank-based transformation
36// mod ordinal_encoding;
37// pub use ordinal_encoding::{
38//     OrdinalEncoder, OrdinalConfig, OrdinalValidator, RankBasedEncoder,
39//     OrderedCategorical, OrdinalMapping, CategoryRanking, OrdinalTransformer,
40//     CustomOrderEncoder, SequentialEncoder, OrdinalOptimizer
41// };
42
43// // Binary encoding and bit-based representation
44// mod binary_encoding;
45// pub use binary_encoding::{
46//     BinaryEncoder, BinaryEncoderConfig, BinaryValidator, BitEncoder,
47//     BinaryRepresentation, BitVectorEncoder, CompactBinaryEncoder,
48//     BinaryFeatureGenerator, BinaryTransformer, BinaryOptimizer
49// };
50
51// // Hash encoding and feature hashing
52// mod hash_encoding;
53// pub use hash_encoding::{
54//     HashEncoder, HashEncoderConfig, HashValidator, FeatureHashing,
55//     HashingTrick, CollisionHandling, HashFunction, MurmurHashEncoder,
56//     CityHashEncoder, HashOptimizer, ConsistentHashing, HashAnalyzer
57// };
58
59// // Frequency encoding and count-based transformation
60// mod frequency_encoding;
61// pub use frequency_encoding::{
62//     FrequencyEncoder, FrequencyEncoderConfig, FrequencyValidator, CountEncoder,
63//     CategoryFrequency, FrequencyTransformer, CountBasedEncoder, RareCategoryHandler,
64//     FrequencyBinning, FrequencyOptimizer, StatisticalFrequencyEncoder
65// };
66
67// // Target encoding and statistical encoding
68// mod target_encoding;
69// pub use target_encoding::{
70//     TargetEncoder, TargetEncodingConfig, TargetValidator, MeanTargetEncoder,
71//     BayesianTargetEncoder, SmoothTargetEncoder, CrossValidationTargetEncoder,
72//     RegularizedTargetEncoder, TargetStatistics, TargetOptimizer, LeaveOneOutEncoder
73// };
74
75// // Embedding-based encoding and learned representations
76// mod embedding_encoding;
77// pub use embedding_encoding::{
78//     EmbeddingEncoder, EmbeddingConfig, EmbeddingValidator, LearnedEmbedding,
79//     CategoricalEmbedding, NeuralEmbedding, Word2VecEncoder, AutoencoderEmbedding,
80//     EmbeddingTransformer, DimensionalityReducedEmbedding, EmbeddingOptimizer
81// };
82
83// // High-cardinality encoding and dimensionality reduction
84// mod cardinality_reduction;
85// pub use cardinality_reduction::{
86//     CardinalityReducer, CardinalityConfig, CardinalityValidator, HighCardinalityHandler,
87//     RareCategoryGrouping, TopKCategorySelector, FrequencyBasedReduction,
88//     HierarchicalGrouping, CardinalityOptimizer, CategoryConsolidator
89// };
90
91// FIXME: Additional modules not implemented yet - commenting out to allow compilation
92// // Time-aware encoding and temporal features
93// mod temporal_encoding;
94// pub use temporal_encoding::{
95//     TemporalEncoder, TemporalConfig, TemporalValidator, TimeAwareEncoder,
96//     SeasonalEncoder, CyclicEncoder, DateTimeEncoder, TimeSeriesEncoder,
97//     TemporalFeatureExtractor, TimestampEncoder, TemporalOptimizer
98// };
99
100// // Cross-validation and robust encoding methods
101// mod crossval_encoding;
102// pub use crossval_encoding::{
103//     CrossValidationEncoder, CVEncodingConfig, CVValidator, FoldBasedEncoder,
104//     KFoldEncoder, StratifiedEncoder, TimeSeriesCVEncoder, RobustEncoder,
105//     LeaveOneOutEncoder, CrossValidationOptimizer, ValidationAwareEncoder
106// };
107
108// // Smoothing and regularization techniques
109// mod smoothing_techniques;
110// pub use smoothing_techniques::{
111//     SmoothingEncoder, SmoothingConfig, SmoothingValidator, BayesianSmoothing,
112//     LaplaceSmoothing, JamesSteinSmoothing, EmpiricalBayesSmoothing,
113//     AdaptiveSmoothing, SmoothingOptimizer, RegularizedSmoothing
114// };
115
116// // Performance optimization and computational efficiency
117// mod performance_optimization;
118// pub use performance_optimization::{
119//     EncodingPerformanceOptimizer, ComputationalEfficiency, MemoryOptimizer,
120//     AlgorithmicOptimizer, CacheOptimizer, ParallelEncodingProcessor
121// };
122
123// // Utilities and helper functions
124// mod encoding_utilities;
125// pub use encoding_utilities::{
126//     EncodingUtilities, CategoricalMathUtils, EncodingAnalysisUtils, ValidationUtils,
127//     ComputationalUtils, HelperFunctions, StatisticalUtils, UtilityValidator
128// };
129
130// FIXME: Re-exports commented out since modules are not implemented
131// // Re-export main encoding classes for backwards compatibility
132// pub use label_encoding::LabelEncoder;
133// pub use onehot_encoding::OneHotEncoder;
134// pub use ordinal_encoding::OrdinalEncoder;
135// pub use binary_encoding::{BinaryEncoder, BinaryEncoderConfig};
136// pub use hash_encoding::{HashEncoder, HashEncoderConfig};
137// pub use frequency_encoding::{FrequencyEncoder, FrequencyEncoderConfig};
138// pub use target_encoding::TargetEncoder;
139// pub use embedding_encoding::EmbeddingEncoder;
140// pub use cardinality_reduction::CardinalityReducer;
141
142// FIXME: Re-export common configurations and types (commented out until modules are implemented)
143// pub use encoding_core::EncodingConfig;
144// pub use label_encoding::LabelEncodingConfig;
145// pub use onehot_encoding::OneHotConfig;
146// pub use ordinal_encoding::OrdinalConfig;
147// pub use binary_encoding::BinaryEncoderConfig;
148// pub use hash_encoding::HashEncoderConfig;
149// pub use frequency_encoding::FrequencyEncoderConfig;
150// pub use target_encoding::TargetEncodingConfig;
151// pub use embedding_encoding::EmbeddingConfig;
152// pub use temporal_encoding::TemporalConfig;
153
154// Actual implementations of encoding functionality
155
156use sklears_core::{
157    error::{Result, SklearsError},
158    traits::{Estimator, Fit, Trained, Untrained},
159    types::Float,
160};
161use std::collections::HashMap;
162use std::marker::PhantomData;
163
164#[cfg(feature = "serde")]
165use serde::{Deserialize, Serialize};
166
167/// Configuration for BinaryEncoder
168#[derive(Debug, Clone)]
169#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
170pub struct BinaryEncoderConfig {
171    /// Whether to drop the first binary column to avoid collinearity
172    pub drop_first: bool,
173    /// How to handle unknown categories during transform
174    pub handle_unknown: UnknownStrategy,
175    /// Whether to use base-2 encoding (true) or natural binary representation (false)
176    pub use_base2: bool,
177}
178
179impl Default for BinaryEncoderConfig {
180    fn default() -> Self {
181        Self {
182            drop_first: false,
183            handle_unknown: UnknownStrategy::Error,
184            use_base2: true,
185        }
186    }
187}
188
/// Strategy for handling categories that were not seen during fitting.
///
/// The enum is a plain, fieldless tag, so it is cheap to copy and can be used as a
/// hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnknownStrategy {
    /// Raise an error when an unknown category is encountered.
    Error,
    /// Assign unknown categories to a special "unknown" encoding.
    Ignore,
    /// Use all zeros for unknown categories.
    Zero,
}
200
201/// Binary encoder for high-cardinality categorical features
202pub struct BinaryEncoder<State = Untrained> {
203    config: BinaryEncoderConfig,
204    fitted_state: Option<BinaryEncoderFitted>,
205    state: PhantomData<State>,
206}
207
208/// Fitted state of BinaryEncoder
209#[derive(Debug, Clone)]
210#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
211pub struct BinaryEncoderFitted {
212    config: BinaryEncoderConfig,
213    /// Mapping from category to binary index
214    category_mapping: HashMap<String, usize>,
215    /// Number of binary columns needed
216    n_binary_cols: usize,
217    /// Categories seen during fitting
218    categories: Vec<String>,
219}
220
221impl BinaryEncoder<Untrained> {
222    /// Create a new BinaryEncoder
223    pub fn new() -> Self {
224        Self {
225            config: BinaryEncoderConfig::default(),
226            fitted_state: None,
227            state: PhantomData,
228        }
229    }
230
231    /// Set whether to drop the first column
232    pub fn drop_first(mut self, drop_first: bool) -> Self {
233        self.config.drop_first = drop_first;
234        self
235    }
236
237    /// Set the strategy for handling unknown categories
238    pub fn handle_unknown(mut self, strategy: UnknownStrategy) -> Self {
239        self.config.handle_unknown = strategy;
240        self
241    }
242
243    /// Set whether to use base-2 encoding
244    pub fn use_base2(mut self, use_base2: bool) -> Self {
245        self.config.use_base2 = use_base2;
246        self
247    }
248}
249
250impl Default for BinaryEncoder<Untrained> {
251    fn default() -> Self {
252        Self::new()
253    }
254}
255
256impl Estimator for BinaryEncoder<Untrained> {
257    type Config = BinaryEncoderConfig;
258    type Error = SklearsError;
259    type Float = Float;
260
261    fn config(&self) -> &Self::Config {
262        &self.config
263    }
264}
265
266impl Estimator for BinaryEncoder<Trained> {
267    type Config = BinaryEncoderConfig;
268    type Error = SklearsError;
269    type Float = Float;
270
271    fn config(&self) -> &Self::Config {
272        &self.fitted_state().config
273    }
274}
275
276impl BinaryEncoder<Trained> {
277    fn fitted_state(&self) -> &BinaryEncoderFitted {
278        self.fitted_state
279            .as_ref()
280            .expect("BinaryEncoder<Trained> must have fitted_state")
281    }
282}
283
284impl Fit<Vec<String>, ()> for BinaryEncoder<Untrained> {
285    type Fitted = BinaryEncoder<Trained>;
286
287    fn fit(self, x: &Vec<String>, _y: &()) -> Result<Self::Fitted> {
288        // Extract unique categories and sort them for deterministic encoding
289        let mut categories = x.clone();
290        categories.sort();
291        categories.dedup();
292
293        let n_categories = categories.len();
294
295        // Calculate number of binary columns needed for encoding
296        // log2(n_categories) rounded up gives the minimum bits needed
297        let n_binary_cols = if n_categories <= 1 {
298            1
299        } else {
300            (n_categories as f64).log2().ceil() as usize
301        };
302
303        // Create mapping from category to its index
304        let category_mapping: HashMap<String, usize> = categories
305            .iter()
306            .enumerate()
307            .map(|(i, cat)| (cat.clone(), i))
308            .collect();
309
310        // Create fitted state
311        let fitted_state = BinaryEncoderFitted {
312            config: self.config.clone(),
313            category_mapping,
314            n_binary_cols,
315            categories,
316        };
317
318        // Return trained encoder with fitted state
319        Ok(BinaryEncoder {
320            config: self.config,
321            fitted_state: Some(fitted_state),
322            state: PhantomData,
323        })
324    }
325}
326
327/// Configuration for HashEncoder
328#[derive(Debug, Clone)]
329#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
330pub struct HashEncoderConfig {
331    /// Number of hash buckets
332    pub n_components: usize,
333    /// Hash function to use
334    pub hash_method: HashMethod,
335    /// Whether to use signed hash (can have negative values)
336    pub alternate_sign: bool,
337}
338
339impl Default for HashEncoderConfig {
340    fn default() -> Self {
341        Self {
342            n_components: 32,
343            hash_method: HashMethod::Md5,
344            alternate_sign: true,
345        }
346    }
347}
348
/// Hash function options.
///
/// The enum is a plain, fieldless tag, so it is cheap to copy and can be used as a
/// hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HashMethod {
    /// MD5 hash function.
    Md5,
    /// Simple modulo hash.
    Modulo,
}
358
359/// Hash encoder for categorical features using feature hashing
360pub struct HashEncoder<State = Untrained> {
361    config: HashEncoderConfig,
362    state: PhantomData<State>,
363}
364
365impl HashEncoder<Untrained> {
366    /// Create a new HashEncoder
367    pub fn new() -> Self {
368        Self {
369            config: HashEncoderConfig::default(),
370            state: PhantomData,
371        }
372    }
373
374    /// Set the number of hash components
375    pub fn n_components(mut self, n_components: usize) -> Self {
376        self.config.n_components = n_components;
377        self
378    }
379
380    /// Set the hash method
381    pub fn hash_method(mut self, method: HashMethod) -> Self {
382        self.config.hash_method = method;
383        self
384    }
385}
386
387impl Default for HashEncoder<Untrained> {
388    fn default() -> Self {
389        Self::new()
390    }
391}
392
393/// Frequency encoder configuration
394#[derive(Debug, Clone)]
395#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
396pub struct FrequencyEncoderConfig {
397    /// Whether to normalize frequencies to probabilities
398    pub normalize: bool,
399    /// Strategy for handling rare categories
400    pub rare_strategy: RareStrategy,
401    /// Threshold for considering categories as rare
402    pub rare_threshold: usize,
403}
404
405impl Default for FrequencyEncoderConfig {
406    fn default() -> Self {
407        Self {
408            normalize: false,
409            rare_strategy: RareStrategy::Keep,
410            rare_threshold: 1,
411        }
412    }
413}
414
/// Strategy for handling rare categories.
///
/// The enum is a plain, fieldless tag, so it is cheap to copy and can be used as a
/// hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum RareStrategy {
    /// Keep rare categories as-is.
    Keep,
    /// Group rare categories together.
    Group,
    /// Replace rare categories with mean frequency.
    MeanFrequency,
}
426
427/// Frequency encoder transforms categories to their occurrence frequencies
428pub struct FrequencyEncoder<State = Untrained> {
429    config: FrequencyEncoderConfig,
430    state: PhantomData<State>,
431}
432
433impl FrequencyEncoder<Untrained> {
434    /// Create a new FrequencyEncoder
435    pub fn new() -> Self {
436        Self {
437            config: FrequencyEncoderConfig::default(),
438            state: PhantomData,
439        }
440    }
441
442    /// Set whether to normalize frequencies
443    pub fn normalize(mut self, normalize: bool) -> Self {
444        self.config.normalize = normalize;
445        self
446    }
447
448    /// Set the rare category strategy
449    pub fn rare_strategy(mut self, strategy: RareStrategy) -> Self {
450        self.config.rare_strategy = strategy;
451        self
452    }
453}
454
455impl Default for FrequencyEncoder<Untrained> {
456    fn default() -> Self {
457        Self::new()
458    }
459}
460
461/// Configuration for CategoricalEmbedding
462#[derive(Debug, Clone)]
463#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
464pub struct CategoricalEmbeddingConfig {
465    /// Embedding dimension
466    pub embedding_dim: usize,
467    /// Learning rate for training
468    pub learning_rate: Float,
469    /// Number of training epochs
470    pub epochs: usize,
471    /// Batch size for training
472    pub batch_size: usize,
473}
474
475impl Default for CategoricalEmbeddingConfig {
476    fn default() -> Self {
477        Self {
478            embedding_dim: 50,
479            learning_rate: 0.01,
480            epochs: 100,
481            batch_size: 32,
482        }
483    }
484}
485
486/// Categorical embedding using neural network-style embeddings
487pub struct CategoricalEmbedding<State = Untrained> {
488    config: CategoricalEmbeddingConfig,
489    state: PhantomData<State>,
490}
491
492impl CategoricalEmbedding<Untrained> {
493    /// Create a new CategoricalEmbedding
494    pub fn new() -> Self {
495        Self {
496            config: CategoricalEmbeddingConfig::default(),
497            state: PhantomData,
498        }
499    }
500
501    /// Set the embedding dimension
502    pub fn embedding_dim(mut self, dim: usize) -> Self {
503        self.config.embedding_dim = dim;
504        self
505    }
506
507    /// Set the learning rate
508    pub fn learning_rate(mut self, lr: Float) -> Self {
509        self.config.learning_rate = lr;
510        self
511    }
512}
513
514impl Default for CategoricalEmbedding<Untrained> {
515    fn default() -> Self {
516        Self::new()
517    }
518}
519
520// Placeholder implementations for the basic encoders
521// These should be replaced with full implementations
522
/// Encoder meant to turn categorical labels into integers.
///
/// Currently an empty placeholder with no fields; proper label encoding still has to
/// be implemented.
#[derive(Clone, Debug, Default)]
pub struct LabelEncoder {}
528
/// Encoder meant to one-hot encode categorical features.
///
/// Currently an empty placeholder with no fields; proper one-hot encoding still has to
/// be implemented.
#[derive(Clone, Debug, Default)]
pub struct OneHotEncoder {}
534
/// Encoder meant for categorical features that have an inherent ordering.
///
/// Currently an empty placeholder with no fields; proper ordinal encoding still has to
/// be implemented.
#[derive(Clone, Debug, Default)]
pub struct OrdinalEncoder {}
540
/// Encoder meant to encode categories using statistics of the target.
///
/// Currently an empty placeholder with no fields; proper target encoding still has to
/// be implemented.
#[derive(Clone, Debug, Default)]
pub struct TargetEncoder {}