sklears_preprocessing/encoding.rs
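//! Data encoding and categorical feature transformation utilities
//!
//! This module is the home for categorical data encoding: label encoding, one-hot
//! encoding, ordinal encoding, binary encoding, hash encoding, frequency encoding,
//! target encoding, feature hashing, categorical transformations, cardinality
//! reduction, embedding-based encoding, statistical encoding, smoothing techniques,
//! regularization methods, cross-validation encoding, time-aware encoding, and
//! high-performance categorical feature processing pipelines. The intent is for each
//! algorithm to live in its own focused submodule for better maintainability, in
//! compliance with the SciRS2 Policy.
//!
//! Current status: those submodules are not implemented yet, so their declarations
//! and re-exports below are commented out. What is available today is defined
//! directly in this file: configuration types and builders for `BinaryEncoder`,
//! `HashEncoder`, `FrequencyEncoder` and `CategoricalEmbedding`, a `Fit`
//! implementation for `BinaryEncoder`, and placeholder structs for `LabelEncoder`,
//! `OneHotEncoder`, `OrdinalEncoder` and `TargetEncoder`.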
10
11// FIXME: These modules are not implemented yet - commenting out to allow compilation
12// // Core encoding types and base structures
13// mod encoding_core;
14// pub use encoding_core::{
15// EncodingProcessor, EncodingConfig, EncodingValidator, EncodingEstimator,
16// EncodingTransformer, EncodingAnalyzer, CategoricalProcessor, FeatureEncoder
17// };
18
19// // Label encoding and categorical to numerical transformation
20// mod label_encoding;
21// pub use label_encoding::{
22// LabelEncoder, LabelEncodingConfig, LabelEncodingValidator, CategoricalToNumerical,
23// StringEncoder, ClassEncoder, IndexMapping, LabelTransformer,
24// InverseLabelEncoder, LabelMappingAnalyzer, MultiLabelEncoder
25// };
26
27// // One-hot encoding and sparse representation
28// mod onehot_encoding;
29// pub use onehot_encoding::{
30// OneHotEncoder, OneHotConfig, OneHotValidator, SparseOneHot,
31// DenseOneHot, BinaryIndicator, CategoricalExpansion, OneHotTransformer,
32// SparseMatrixEncoder, OneHotOptimizer, MemoryEfficientOneHot
33// };
34
35// // Ordinal encoding and rank-based transformation
36// mod ordinal_encoding;
37// pub use ordinal_encoding::{
38// OrdinalEncoder, OrdinalConfig, OrdinalValidator, RankBasedEncoder,
39// OrderedCategorical, OrdinalMapping, CategoryRanking, OrdinalTransformer,
40// CustomOrderEncoder, SequentialEncoder, OrdinalOptimizer
41// };
42
43// // Binary encoding and bit-based representation
44// mod binary_encoding;
45// pub use binary_encoding::{
46// BinaryEncoder, BinaryEncoderConfig, BinaryValidator, BitEncoder,
47// BinaryRepresentation, BitVectorEncoder, CompactBinaryEncoder,
48// BinaryFeatureGenerator, BinaryTransformer, BinaryOptimizer
49// };
50
51// // Hash encoding and feature hashing
52// mod hash_encoding;
53// pub use hash_encoding::{
54// HashEncoder, HashEncoderConfig, HashValidator, FeatureHashing,
55// HashingTrick, CollisionHandling, HashFunction, MurmurHashEncoder,
56// CityHashEncoder, HashOptimizer, ConsistentHashing, HashAnalyzer
57// };
58
59// // Frequency encoding and count-based transformation
60// mod frequency_encoding;
61// pub use frequency_encoding::{
62// FrequencyEncoder, FrequencyEncoderConfig, FrequencyValidator, CountEncoder,
63// CategoryFrequency, FrequencyTransformer, CountBasedEncoder, RareCategoryHandler,
64// FrequencyBinning, FrequencyOptimizer, StatisticalFrequencyEncoder
65// };
66
67// // Target encoding and statistical encoding
68// mod target_encoding;
69// pub use target_encoding::{
70// TargetEncoder, TargetEncodingConfig, TargetValidator, MeanTargetEncoder,
71// BayesianTargetEncoder, SmoothTargetEncoder, CrossValidationTargetEncoder,
72// RegularizedTargetEncoder, TargetStatistics, TargetOptimizer, LeaveOneOutEncoder
73// };
74
75// // Embedding-based encoding and learned representations
76// mod embedding_encoding;
77// pub use embedding_encoding::{
78// EmbeddingEncoder, EmbeddingConfig, EmbeddingValidator, LearnedEmbedding,
79// CategoricalEmbedding, NeuralEmbedding, Word2VecEncoder, AutoencoderEmbedding,
80// EmbeddingTransformer, DimensionalityReducedEmbedding, EmbeddingOptimizer
81// };
82
83// // High-cardinality encoding and dimensionality reduction
84// mod cardinality_reduction;
85// pub use cardinality_reduction::{
86// CardinalityReducer, CardinalityConfig, CardinalityValidator, HighCardinalityHandler,
87// RareCategoryGrouping, TopKCategorySelector, FrequencyBasedReduction,
88// HierarchicalGrouping, CardinalityOptimizer, CategoryConsolidator
89// };
90
91// FIXME: Additional modules not implemented yet - commenting out to allow compilation
92// // Time-aware encoding and temporal features
93// mod temporal_encoding;
94// pub use temporal_encoding::{
95// TemporalEncoder, TemporalConfig, TemporalValidator, TimeAwareEncoder,
96// SeasonalEncoder, CyclicEncoder, DateTimeEncoder, TimeSeriesEncoder,
97// TemporalFeatureExtractor, TimestampEncoder, TemporalOptimizer
98// };
99
100// // Cross-validation and robust encoding methods
101// mod crossval_encoding;
102// pub use crossval_encoding::{
103// CrossValidationEncoder, CVEncodingConfig, CVValidator, FoldBasedEncoder,
104// KFoldEncoder, StratifiedEncoder, TimeSeriesCVEncoder, RobustEncoder,
105// LeaveOneOutEncoder, CrossValidationOptimizer, ValidationAwareEncoder
106// };
107
108// // Smoothing and regularization techniques
109// mod smoothing_techniques;
110// pub use smoothing_techniques::{
111// SmoothingEncoder, SmoothingConfig, SmoothingValidator, BayesianSmoothing,
112// LaplaceSmoothing, JamesSteinSmoothing, EmpiricalBayesSmoothing,
113// AdaptiveSmoothing, SmoothingOptimizer, RegularizedSmoothing
114// };
115
116// // Performance optimization and computational efficiency
117// mod performance_optimization;
118// pub use performance_optimization::{
119// EncodingPerformanceOptimizer, ComputationalEfficiency, MemoryOptimizer,
120// AlgorithmicOptimizer, CacheOptimizer, ParallelEncodingProcessor
121// };
122
123// // Utilities and helper functions
124// mod encoding_utilities;
125// pub use encoding_utilities::{
126// EncodingUtilities, CategoricalMathUtils, EncodingAnalysisUtils, ValidationUtils,
127// ComputationalUtils, HelperFunctions, StatisticalUtils, UtilityValidator
128// };
129
130// FIXME: Re-exports commented out since modules are not implemented
131// // Re-export main encoding classes for backwards compatibility
132// pub use label_encoding::LabelEncoder;
133// pub use onehot_encoding::OneHotEncoder;
134// pub use ordinal_encoding::OrdinalEncoder;
135// pub use binary_encoding::{BinaryEncoder, BinaryEncoderConfig};
136// pub use hash_encoding::{HashEncoder, HashEncoderConfig};
137// pub use frequency_encoding::{FrequencyEncoder, FrequencyEncoderConfig};
138// pub use target_encoding::TargetEncoder;
139// pub use embedding_encoding::EmbeddingEncoder;
140// pub use cardinality_reduction::CardinalityReducer;
141
142// FIXME: Re-export common configurations and types (commented out until modules are implemented)
143// pub use encoding_core::EncodingConfig;
144// pub use label_encoding::LabelEncodingConfig;
145// pub use onehot_encoding::OneHotConfig;
146// pub use ordinal_encoding::OrdinalConfig;
147// pub use binary_encoding::BinaryEncoderConfig;
148// pub use hash_encoding::HashEncoderConfig;
149// pub use frequency_encoding::FrequencyEncoderConfig;
150// pub use target_encoding::TargetEncodingConfig;
151// pub use embedding_encoding::EmbeddingConfig;
152// pub use temporal_encoding::TemporalConfig;
153
154// Actual implementations of encoding functionality
155
156use sklears_core::{
157 error::{Result, SklearsError},
158 traits::{Estimator, Fit, Trained, Untrained},
159 types::Float,
160};
161use std::collections::HashMap;
162use std::marker::PhantomData;
163
164#[cfg(feature = "serde")]
165use serde::{Deserialize, Serialize};
166
167/// Configuration for BinaryEncoder
168#[derive(Debug, Clone)]
169#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
170pub struct BinaryEncoderConfig {
171 /// Whether to drop the first binary column to avoid collinearity
172 pub drop_first: bool,
173 /// How to handle unknown categories during transform
174 pub handle_unknown: UnknownStrategy,
175 /// Whether to use base-2 encoding (true) or natural binary representation (false)
176 pub use_base2: bool,
177}
178
179impl Default for BinaryEncoderConfig {
180 fn default() -> Self {
181 Self {
182 drop_first: false,
183 handle_unknown: UnknownStrategy::Error,
184 use_base2: true,
185 }
186 }
187}
188
/// How an encoder should treat a category it did not see while fitting.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnknownStrategy {
    /// Report an error as soon as an unknown category is encountered.
    Error,
    /// Map unknown categories to a dedicated "unknown" encoding.
    Ignore,
    /// Encode unknown categories as all zeros.
    Zero,
}
200
201/// Binary encoder for high-cardinality categorical features
202pub struct BinaryEncoder<State = Untrained> {
203 config: BinaryEncoderConfig,
204 fitted_state: Option<BinaryEncoderFitted>,
205 state: PhantomData<State>,
206}
207
208/// Fitted state of BinaryEncoder
209#[derive(Debug, Clone)]
210#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
211pub struct BinaryEncoderFitted {
212 config: BinaryEncoderConfig,
213 /// Mapping from category to binary index
214 category_mapping: HashMap<String, usize>,
215 /// Number of binary columns needed
216 n_binary_cols: usize,
217 /// Categories seen during fitting
218 categories: Vec<String>,
219}
220
221impl BinaryEncoder<Untrained> {
222 /// Create a new BinaryEncoder
223 pub fn new() -> Self {
224 Self {
225 config: BinaryEncoderConfig::default(),
226 fitted_state: None,
227 state: PhantomData,
228 }
229 }
230
231 /// Set whether to drop the first column
232 pub fn drop_first(mut self, drop_first: bool) -> Self {
233 self.config.drop_first = drop_first;
234 self
235 }
236
237 /// Set the strategy for handling unknown categories
238 pub fn handle_unknown(mut self, strategy: UnknownStrategy) -> Self {
239 self.config.handle_unknown = strategy;
240 self
241 }
242
243 /// Set whether to use base-2 encoding
244 pub fn use_base2(mut self, use_base2: bool) -> Self {
245 self.config.use_base2 = use_base2;
246 self
247 }
248}
249
250impl Default for BinaryEncoder<Untrained> {
251 fn default() -> Self {
252 Self::new()
253 }
254}
255
256impl Estimator for BinaryEncoder<Untrained> {
257 type Config = BinaryEncoderConfig;
258 type Error = SklearsError;
259 type Float = Float;
260
261 fn config(&self) -> &Self::Config {
262 &self.config
263 }
264}
265
266impl Estimator for BinaryEncoder<Trained> {
267 type Config = BinaryEncoderConfig;
268 type Error = SklearsError;
269 type Float = Float;
270
271 fn config(&self) -> &Self::Config {
272 &self.fitted_state().config
273 }
274}
275
276impl BinaryEncoder<Trained> {
277 fn fitted_state(&self) -> &BinaryEncoderFitted {
278 self.fitted_state
279 .as_ref()
280 .expect("BinaryEncoder<Trained> must have fitted_state")
281 }
282}
283
284impl Fit<Vec<String>, ()> for BinaryEncoder<Untrained> {
285 type Fitted = BinaryEncoder<Trained>;
286
287 fn fit(self, x: &Vec<String>, _y: &()) -> Result<Self::Fitted> {
288 // Extract unique categories and sort them for deterministic encoding
289 let mut categories = x.clone();
290 categories.sort();
291 categories.dedup();
292
293 let n_categories = categories.len();
294
295 // Calculate number of binary columns needed for encoding
296 // log2(n_categories) rounded up gives the minimum bits needed
297 let n_binary_cols = if n_categories <= 1 {
298 1
299 } else {
300 (n_categories as f64).log2().ceil() as usize
301 };
302
303 // Create mapping from category to its index
304 let category_mapping: HashMap<String, usize> = categories
305 .iter()
306 .enumerate()
307 .map(|(i, cat)| (cat.clone(), i))
308 .collect();
309
310 // Create fitted state
311 let fitted_state = BinaryEncoderFitted {
312 config: self.config.clone(),
313 category_mapping,
314 n_binary_cols,
315 categories,
316 };
317
318 // Return trained encoder with fitted state
319 Ok(BinaryEncoder {
320 config: self.config,
321 fitted_state: Some(fitted_state),
322 state: PhantomData,
323 })
324 }
325}
326
327/// Configuration for HashEncoder
328#[derive(Debug, Clone)]
329#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
330pub struct HashEncoderConfig {
331 /// Number of hash buckets
332 pub n_components: usize,
333 /// Hash function to use
334 pub hash_method: HashMethod,
335 /// Whether to use signed hash (can have negative values)
336 pub alternate_sign: bool,
337}
338
339impl Default for HashEncoderConfig {
340 fn default() -> Self {
341 Self {
342 n_components: 32,
343 hash_method: HashMethod::Md5,
344 alternate_sign: true,
345 }
346 }
347}
348
/// Hash functions selectable through `HashEncoderConfig::hash_method`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HashMethod {
    /// The MD5 hash function.
    Md5,
    /// A simple modulo-based hash.
    Modulo,
}
358
359/// Hash encoder for categorical features using feature hashing
360pub struct HashEncoder<State = Untrained> {
361 config: HashEncoderConfig,
362 state: PhantomData<State>,
363}
364
365impl HashEncoder<Untrained> {
366 /// Create a new HashEncoder
367 pub fn new() -> Self {
368 Self {
369 config: HashEncoderConfig::default(),
370 state: PhantomData,
371 }
372 }
373
374 /// Set the number of hash components
375 pub fn n_components(mut self, n_components: usize) -> Self {
376 self.config.n_components = n_components;
377 self
378 }
379
380 /// Set the hash method
381 pub fn hash_method(mut self, method: HashMethod) -> Self {
382 self.config.hash_method = method;
383 self
384 }
385}
386
387impl Default for HashEncoder<Untrained> {
388 fn default() -> Self {
389 Self::new()
390 }
391}
392
393/// Frequency encoder configuration
394#[derive(Debug, Clone)]
395#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
396pub struct FrequencyEncoderConfig {
397 /// Whether to normalize frequencies to probabilities
398 pub normalize: bool,
399 /// Strategy for handling rare categories
400 pub rare_strategy: RareStrategy,
401 /// Threshold for considering categories as rare
402 pub rare_threshold: usize,
403}
404
405impl Default for FrequencyEncoderConfig {
406 fn default() -> Self {
407 Self {
408 normalize: false,
409 rare_strategy: RareStrategy::Keep,
410 rare_threshold: 1,
411 }
412 }
413}
414
/// What a frequency encoder should do with rare categories.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum RareStrategy {
    /// Leave rare categories untouched.
    Keep,
    /// Merge rare categories into a single group.
    Group,
    /// Substitute the mean frequency for rare categories.
    MeanFrequency,
}
426
427/// Frequency encoder transforms categories to their occurrence frequencies
428pub struct FrequencyEncoder<State = Untrained> {
429 config: FrequencyEncoderConfig,
430 state: PhantomData<State>,
431}
432
433impl FrequencyEncoder<Untrained> {
434 /// Create a new FrequencyEncoder
435 pub fn new() -> Self {
436 Self {
437 config: FrequencyEncoderConfig::default(),
438 state: PhantomData,
439 }
440 }
441
442 /// Set whether to normalize frequencies
443 pub fn normalize(mut self, normalize: bool) -> Self {
444 self.config.normalize = normalize;
445 self
446 }
447
448 /// Set the rare category strategy
449 pub fn rare_strategy(mut self, strategy: RareStrategy) -> Self {
450 self.config.rare_strategy = strategy;
451 self
452 }
453}
454
455impl Default for FrequencyEncoder<Untrained> {
456 fn default() -> Self {
457 Self::new()
458 }
459}
460
461/// Configuration for CategoricalEmbedding
462#[derive(Debug, Clone)]
463#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
464pub struct CategoricalEmbeddingConfig {
465 /// Embedding dimension
466 pub embedding_dim: usize,
467 /// Learning rate for training
468 pub learning_rate: Float,
469 /// Number of training epochs
470 pub epochs: usize,
471 /// Batch size for training
472 pub batch_size: usize,
473}
474
475impl Default for CategoricalEmbeddingConfig {
476 fn default() -> Self {
477 Self {
478 embedding_dim: 50,
479 learning_rate: 0.01,
480 epochs: 100,
481 batch_size: 32,
482 }
483 }
484}
485
486/// Categorical embedding using neural network-style embeddings
487pub struct CategoricalEmbedding<State = Untrained> {
488 config: CategoricalEmbeddingConfig,
489 state: PhantomData<State>,
490}
491
492impl CategoricalEmbedding<Untrained> {
493 /// Create a new CategoricalEmbedding
494 pub fn new() -> Self {
495 Self {
496 config: CategoricalEmbeddingConfig::default(),
497 state: PhantomData,
498 }
499 }
500
501 /// Set the embedding dimension
502 pub fn embedding_dim(mut self, dim: usize) -> Self {
503 self.config.embedding_dim = dim;
504 self
505 }
506
507 /// Set the learning rate
508 pub fn learning_rate(mut self, lr: Float) -> Self {
509 self.config.learning_rate = lr;
510 self
511 }
512}
513
514impl Default for CategoricalEmbedding<Untrained> {
515 fn default() -> Self {
516 Self::new()
517 }
518}
519
520// Placeholder implementations for the basic encoders
521// These should be replaced with full implementations
522
/// Label encoder for transforming categorical labels to integers.
///
/// Currently a field-less placeholder: no encoding logic is implemented yet.
#[derive(Clone, Debug, Default)]
pub struct LabelEncoder {
    // TODO: replace this placeholder with a real label-encoding implementation.
}
528
/// One-hot encoder for categorical features.
///
/// Currently a field-less placeholder: no encoding logic is implemented yet.
#[derive(Clone, Debug, Default)]
pub struct OneHotEncoder {
    // TODO: replace this placeholder with a real one-hot-encoding implementation.
}
534
/// Ordinal encoder for categorical features with inherent ordering.
///
/// Currently a field-less placeholder: no encoding logic is implemented yet.
#[derive(Clone, Debug, Default)]
pub struct OrdinalEncoder {
    // TODO: replace this placeholder with a real ordinal-encoding implementation.
}
540
/// Target encoder using target statistics for categorical encoding.
///
/// Currently a field-less placeholder: no encoding logic is implemented yet.
#[derive(Clone, Debug, Default)]
pub struct TargetEncoder {
    // TODO: replace this placeholder with a real target-encoding implementation.
}