sklears_preprocessing/encoding.rs
//! Data encoding and categorical feature transformation utilities
//!
//! This module is intended to provide comprehensive data encoding implementations including
//! label encoding, one-hot encoding, ordinal encoding, binary encoding, hash encoding,
//! frequency encoding, target encoding, feature hashing, categorical transformations,
//! cardinality reduction, embedding-based encoding, statistical encoding, smoothing techniques,
//! regularization methods, cross-validation encoding, time-aware encoding, and
//! high-performance categorical feature processing pipelines. The algorithms are meant to be
//! split into focused modules for better maintainability and to comply with SciRS2 Policy;
//! those modules are not implemented yet (see the FIXME notes below), so this file currently
//! defines the configuration types, builders, and placeholder encoders directly.
10
11// FIXME: These modules are not implemented yet - commenting out to allow compilation
12// // Core encoding types and base structures
13// mod encoding_core;
14// pub use encoding_core::{
15// EncodingProcessor, EncodingConfig, EncodingValidator, EncodingEstimator,
16// EncodingTransformer, EncodingAnalyzer, CategoricalProcessor, FeatureEncoder
17// };
18
19// // Label encoding and categorical to numerical transformation
20// mod label_encoding;
21// pub use label_encoding::{
22// LabelEncoder, LabelEncodingConfig, LabelEncodingValidator, CategoricalToNumerical,
23// StringEncoder, ClassEncoder, IndexMapping, LabelTransformer,
24// InverseLabelEncoder, LabelMappingAnalyzer, MultiLabelEncoder
25// };
26
27// // One-hot encoding and sparse representation
28// mod onehot_encoding;
29// pub use onehot_encoding::{
30// OneHotEncoder, OneHotConfig, OneHotValidator, SparseOneHot,
31// DenseOneHot, BinaryIndicator, CategoricalExpansion, OneHotTransformer,
32// SparseMatrixEncoder, OneHotOptimizer, MemoryEfficientOneHot
33// };
34
35// // Ordinal encoding and rank-based transformation
36// mod ordinal_encoding;
37// pub use ordinal_encoding::{
38// OrdinalEncoder, OrdinalConfig, OrdinalValidator, RankBasedEncoder,
39// OrderedCategorical, OrdinalMapping, CategoryRanking, OrdinalTransformer,
40// CustomOrderEncoder, SequentialEncoder, OrdinalOptimizer
41// };
42
43// // Binary encoding and bit-based representation
44// mod binary_encoding;
45// pub use binary_encoding::{
46// BinaryEncoder, BinaryEncoderConfig, BinaryValidator, BitEncoder,
47// BinaryRepresentation, BitVectorEncoder, CompactBinaryEncoder,
48// BinaryFeatureGenerator, BinaryTransformer, BinaryOptimizer
49// };
50
51// // Hash encoding and feature hashing
52// mod hash_encoding;
53// pub use hash_encoding::{
54// HashEncoder, HashEncoderConfig, HashValidator, FeatureHashing,
55// HashingTrick, CollisionHandling, HashFunction, MurmurHashEncoder,
56// CityHashEncoder, HashOptimizer, ConsistentHashing, HashAnalyzer
57// };
58
59// // Frequency encoding and count-based transformation
60// mod frequency_encoding;
61// pub use frequency_encoding::{
62// FrequencyEncoder, FrequencyEncoderConfig, FrequencyValidator, CountEncoder,
63// CategoryFrequency, FrequencyTransformer, CountBasedEncoder, RareCategoryHandler,
64// FrequencyBinning, FrequencyOptimizer, StatisticalFrequencyEncoder
65// };
66
67// // Target encoding and statistical encoding
68// mod target_encoding;
69// pub use target_encoding::{
70// TargetEncoder, TargetEncodingConfig, TargetValidator, MeanTargetEncoder,
71// BayesianTargetEncoder, SmoothTargetEncoder, CrossValidationTargetEncoder,
72// RegularizedTargetEncoder, TargetStatistics, TargetOptimizer, LeaveOneOutEncoder
73// };
74
75// // Embedding-based encoding and learned representations
76// mod embedding_encoding;
77// pub use embedding_encoding::{
78// EmbeddingEncoder, EmbeddingConfig, EmbeddingValidator, LearnedEmbedding,
79// CategoricalEmbedding, NeuralEmbedding, Word2VecEncoder, AutoencoderEmbedding,
80// EmbeddingTransformer, DimensionalityReducedEmbedding, EmbeddingOptimizer
81// };
82
83// // High-cardinality encoding and dimensionality reduction
84// mod cardinality_reduction;
85// pub use cardinality_reduction::{
86// CardinalityReducer, CardinalityConfig, CardinalityValidator, HighCardinalityHandler,
87// RareCategoryGrouping, TopKCategorySelector, FrequencyBasedReduction,
88// HierarchicalGrouping, CardinalityOptimizer, CategoryConsolidator
89// };
90
91// FIXME: Additional modules not implemented yet - commenting out to allow compilation
92// // Time-aware encoding and temporal features
93// mod temporal_encoding;
94// pub use temporal_encoding::{
95// TemporalEncoder, TemporalConfig, TemporalValidator, TimeAwareEncoder,
96// SeasonalEncoder, CyclicEncoder, DateTimeEncoder, TimeSeriesEncoder,
97// TemporalFeatureExtractor, TimestampEncoder, TemporalOptimizer
98// };
99
100// // Cross-validation and robust encoding methods
101// mod crossval_encoding;
102// pub use crossval_encoding::{
103// CrossValidationEncoder, CVEncodingConfig, CVValidator, FoldBasedEncoder,
104// KFoldEncoder, StratifiedEncoder, TimeSeriesCVEncoder, RobustEncoder,
105// LeaveOneOutEncoder, CrossValidationOptimizer, ValidationAwareEncoder
106// };
107
108// // Smoothing and regularization techniques
109// mod smoothing_techniques;
110// pub use smoothing_techniques::{
111// SmoothingEncoder, SmoothingConfig, SmoothingValidator, BayesianSmoothing,
112// LaplaceSmoothing, JamesSteinSmoothing, EmpiricalBayesSmoothing,
113// AdaptiveSmoothing, SmoothingOptimizer, RegularizedSmoothing
114// };
115
116// // Performance optimization and computational efficiency
117// mod performance_optimization;
118// pub use performance_optimization::{
119// EncodingPerformanceOptimizer, ComputationalEfficiency, MemoryOptimizer,
120// AlgorithmicOptimizer, CacheOptimizer, ParallelEncodingProcessor
121// };
122
123// // Utilities and helper functions
124// mod encoding_utilities;
125// pub use encoding_utilities::{
126// EncodingUtilities, CategoricalMathUtils, EncodingAnalysisUtils, ValidationUtils,
127// ComputationalUtils, HelperFunctions, StatisticalUtils, UtilityValidator
128// };
129
130// FIXME: Re-exports commented out since modules are not implemented
131// // Re-export main encoding classes for backwards compatibility
132// pub use label_encoding::LabelEncoder;
133// pub use onehot_encoding::OneHotEncoder;
134// pub use ordinal_encoding::OrdinalEncoder;
135// pub use binary_encoding::{BinaryEncoder, BinaryEncoderConfig};
136// pub use hash_encoding::{HashEncoder, HashEncoderConfig};
137// pub use frequency_encoding::{FrequencyEncoder, FrequencyEncoderConfig};
138// pub use target_encoding::TargetEncoder;
139// pub use embedding_encoding::EmbeddingEncoder;
140// pub use cardinality_reduction::CardinalityReducer;
141
142// FIXME: Re-export common configurations and types (commented out until modules are implemented)
143// pub use encoding_core::EncodingConfig;
144// pub use label_encoding::LabelEncodingConfig;
145// pub use onehot_encoding::OneHotConfig;
146// pub use ordinal_encoding::OrdinalConfig;
147// pub use binary_encoding::BinaryEncoderConfig;
148// pub use hash_encoding::HashEncoderConfig;
149// pub use frequency_encoding::FrequencyEncoderConfig;
150// pub use target_encoding::TargetEncodingConfig;
151// pub use embedding_encoding::EmbeddingConfig;
152// pub use temporal_encoding::TemporalConfig;
153
154// Actual implementations of encoding functionality
155
156use sklears_core::{
157 error::{Result, SklearsError},
158 traits::{Estimator, Fit, Trained, Untrained},
159 types::Float,
160};
161use std::collections::HashMap;
162use std::marker::PhantomData;
163
164#[cfg(feature = "serde")]
165use serde::{Deserialize, Serialize};
166
167/// Configuration for BinaryEncoder
168#[derive(Debug, Clone)]
169#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
170pub struct BinaryEncoderConfig {
171 /// Whether to drop the first binary column to avoid collinearity
172 pub drop_first: bool,
173 /// How to handle unknown categories during transform
174 pub handle_unknown: UnknownStrategy,
175 /// Whether to use base-2 encoding (true) or natural binary representation (false)
176 pub use_base2: bool,
177}
178
179impl Default for BinaryEncoderConfig {
180 fn default() -> Self {
181 Self {
182 drop_first: false,
183 handle_unknown: UnknownStrategy::Error,
184 use_base2: true,
185 }
186 }
187}
188
/// What an encoder should do with a category it has not seen before.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnknownStrategy {
    /// Fail with an error as soon as an unknown category is encountered.
    Error,
    /// Map unknown categories to a dedicated "unknown" encoding.
    Ignore,
    /// Encode unknown categories as all zeros.
    Zero,
}
200
201/// Binary encoder for high-cardinality categorical features
202pub struct BinaryEncoder<State = Untrained> {
203 config: BinaryEncoderConfig,
204 state: PhantomData<State>,
205}
206
207/// Fitted state of BinaryEncoder
208#[derive(Debug, Clone)]
209#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
210pub struct BinaryEncoderFitted {
211 config: BinaryEncoderConfig,
212 /// Mapping from category to binary index
213 category_mapping: HashMap<String, usize>,
214 /// Number of binary columns needed
215 n_binary_cols: usize,
216 /// Categories seen during fitting
217 categories: Vec<String>,
218}
219
220impl BinaryEncoder<Untrained> {
221 /// Create a new BinaryEncoder
222 pub fn new() -> Self {
223 Self {
224 config: BinaryEncoderConfig::default(),
225 state: PhantomData,
226 }
227 }
228
229 /// Set whether to drop the first column
230 pub fn drop_first(mut self, drop_first: bool) -> Self {
231 self.config.drop_first = drop_first;
232 self
233 }
234
235 /// Set the strategy for handling unknown categories
236 pub fn handle_unknown(mut self, strategy: UnknownStrategy) -> Self {
237 self.config.handle_unknown = strategy;
238 self
239 }
240
241 /// Set whether to use base-2 encoding
242 pub fn use_base2(mut self, use_base2: bool) -> Self {
243 self.config.use_base2 = use_base2;
244 self
245 }
246}
247
248impl Default for BinaryEncoder<Untrained> {
249 fn default() -> Self {
250 Self::new()
251 }
252}
253
254impl Estimator for BinaryEncoder<Untrained> {
255 type Config = BinaryEncoderConfig;
256 type Error = SklearsError;
257 type Float = Float;
258
259 fn config(&self) -> &Self::Config {
260 &self.config
261 }
262}
263
264impl Estimator for BinaryEncoder<Trained> {
265 type Config = BinaryEncoderConfig;
266 type Error = SklearsError;
267 type Float = Float;
268
269 fn config(&self) -> &Self::Config {
270 &self.fitted_state().config
271 }
272}
273
274impl BinaryEncoder<Trained> {
275 fn fitted_state(&self) -> &BinaryEncoderFitted {
276 // This would be properly implemented with the actual fitted state
277 // For now, returning a placeholder
278 unsafe { &*(std::ptr::null::<BinaryEncoderFitted>()) }
279 }
280}
281
282impl Fit<Vec<String>, ()> for BinaryEncoder<Untrained> {
283 type Fitted = BinaryEncoder<Trained>;
284
285 fn fit(self, x: &Vec<String>, _y: &()) -> Result<Self::Fitted> {
286 let mut categories = x.clone();
287 categories.sort();
288 categories.dedup();
289
290 let n_categories = categories.len();
291 let n_binary_cols = if n_categories <= 1 {
292 1
293 } else {
294 (n_categories as f64).log2().ceil() as usize
295 };
296
297 let category_mapping: HashMap<String, usize> = categories
298 .iter()
299 .enumerate()
300 .map(|(i, cat)| (cat.clone(), i))
301 .collect();
302
303 // Note: In a full implementation, this would properly store the fitted state
304 // For now, this is a structural placeholder
305 todo!("Complete implementation requires proper state management")
306 }
307}
308
309/// Configuration for HashEncoder
310#[derive(Debug, Clone)]
311#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
312pub struct HashEncoderConfig {
313 /// Number of hash buckets
314 pub n_components: usize,
315 /// Hash function to use
316 pub hash_method: HashMethod,
317 /// Whether to use signed hash (can have negative values)
318 pub alternate_sign: bool,
319}
320
321impl Default for HashEncoderConfig {
322 fn default() -> Self {
323 Self {
324 n_components: 32,
325 hash_method: HashMethod::Md5,
326 alternate_sign: true,
327 }
328 }
329}
330
/// Hash functions a [`HashEncoder`] can be configured with.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HashMethod {
    /// The MD5 hash function.
    Md5,
    /// A simple modulo-based hash.
    Modulo,
}
340
341/// Hash encoder for categorical features using feature hashing
342pub struct HashEncoder<State = Untrained> {
343 config: HashEncoderConfig,
344 state: PhantomData<State>,
345}
346
347impl HashEncoder<Untrained> {
348 /// Create a new HashEncoder
349 pub fn new() -> Self {
350 Self {
351 config: HashEncoderConfig::default(),
352 state: PhantomData,
353 }
354 }
355
356 /// Set the number of hash components
357 pub fn n_components(mut self, n_components: usize) -> Self {
358 self.config.n_components = n_components;
359 self
360 }
361
362 /// Set the hash method
363 pub fn hash_method(mut self, method: HashMethod) -> Self {
364 self.config.hash_method = method;
365 self
366 }
367}
368
369impl Default for HashEncoder<Untrained> {
370 fn default() -> Self {
371 Self::new()
372 }
373}
374
375/// Frequency encoder configuration
376#[derive(Debug, Clone)]
377#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
378pub struct FrequencyEncoderConfig {
379 /// Whether to normalize frequencies to probabilities
380 pub normalize: bool,
381 /// Strategy for handling rare categories
382 pub rare_strategy: RareStrategy,
383 /// Threshold for considering categories as rare
384 pub rare_threshold: usize,
385}
386
387impl Default for FrequencyEncoderConfig {
388 fn default() -> Self {
389 Self {
390 normalize: false,
391 rare_strategy: RareStrategy::Keep,
392 rare_threshold: 1,
393 }
394 }
395}
396
/// What a [`FrequencyEncoder`] should do with rare categories.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum RareStrategy {
    /// Leave rare categories unchanged.
    Keep,
    /// Merge rare categories into one group.
    Group,
    /// Substitute the mean frequency for rare categories.
    MeanFrequency,
}
408
409/// Frequency encoder transforms categories to their occurrence frequencies
410pub struct FrequencyEncoder<State = Untrained> {
411 config: FrequencyEncoderConfig,
412 state: PhantomData<State>,
413}
414
415impl FrequencyEncoder<Untrained> {
416 /// Create a new FrequencyEncoder
417 pub fn new() -> Self {
418 Self {
419 config: FrequencyEncoderConfig::default(),
420 state: PhantomData,
421 }
422 }
423
424 /// Set whether to normalize frequencies
425 pub fn normalize(mut self, normalize: bool) -> Self {
426 self.config.normalize = normalize;
427 self
428 }
429
430 /// Set the rare category strategy
431 pub fn rare_strategy(mut self, strategy: RareStrategy) -> Self {
432 self.config.rare_strategy = strategy;
433 self
434 }
435}
436
437impl Default for FrequencyEncoder<Untrained> {
438 fn default() -> Self {
439 Self::new()
440 }
441}
442
443/// Configuration for CategoricalEmbedding
444#[derive(Debug, Clone)]
445#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
446pub struct CategoricalEmbeddingConfig {
447 /// Embedding dimension
448 pub embedding_dim: usize,
449 /// Learning rate for training
450 pub learning_rate: Float,
451 /// Number of training epochs
452 pub epochs: usize,
453 /// Batch size for training
454 pub batch_size: usize,
455}
456
457impl Default for CategoricalEmbeddingConfig {
458 fn default() -> Self {
459 Self {
460 embedding_dim: 50,
461 learning_rate: 0.01,
462 epochs: 100,
463 batch_size: 32,
464 }
465 }
466}
467
468/// Categorical embedding using neural network-style embeddings
469pub struct CategoricalEmbedding<State = Untrained> {
470 config: CategoricalEmbeddingConfig,
471 state: PhantomData<State>,
472}
473
474impl CategoricalEmbedding<Untrained> {
475 /// Create a new CategoricalEmbedding
476 pub fn new() -> Self {
477 Self {
478 config: CategoricalEmbeddingConfig::default(),
479 state: PhantomData,
480 }
481 }
482
483 /// Set the embedding dimension
484 pub fn embedding_dim(mut self, dim: usize) -> Self {
485 self.config.embedding_dim = dim;
486 self
487 }
488
489 /// Set the learning rate
490 pub fn learning_rate(mut self, lr: Float) -> Self {
491 self.config.learning_rate = lr;
492 self
493 }
494}
495
496impl Default for CategoricalEmbedding<Untrained> {
497 fn default() -> Self {
498 Self::new()
499 }
500}
501
502// Placeholder implementations for the basic encoders
503// These should be replaced with full implementations
504
/// Encoder that turns categorical labels into integers.
///
/// Currently an empty placeholder type with no encoding logic.
#[derive(Clone, Debug, Default)]
pub struct LabelEncoder {
    // Placeholder: proper label encoding still has to be implemented.
}
510
/// Encoder that one-hot encodes categorical features.
///
/// Currently an empty placeholder type with no encoding logic.
#[derive(Clone, Debug, Default)]
pub struct OneHotEncoder {
    // Placeholder: proper one-hot encoding still has to be implemented.
}
516
/// Encoder for categorical features that have an inherent ordering.
///
/// Currently an empty placeholder type with no encoding logic.
#[derive(Clone, Debug, Default)]
pub struct OrdinalEncoder {
    // Placeholder: proper ordinal encoding still has to be implemented.
}
522
/// Encoder that encodes categories using target statistics.
///
/// Currently an empty placeholder type with no encoding logic.
#[derive(Clone, Debug, Default)]
pub struct TargetEncoder {
    // Placeholder: proper target encoding still has to be implemented.
}