gbrt_rs/data/preprocessing.rs

//! Data Preprocessing for Machine Learning
//!
//! This module provides preprocessing utilities for preparing raw data
//! for gradient boosting models:
//!
//! - **Feature Scaling**: Standardization and min-max normalization
//! - **Categorical Encoding**: Convert string categories to numerical values
//! - **Automatic Detection**: Identify categorical columns heuristically
//! - **Preprocessor Pipeline**: Combine multiple preprocessing steps
//!
//! # Scaling Strategies
//!
//! - [`StandardScaler`]: Zero mean, unit variance (z-score normalization)
//! - [`MinMaxScaler`]: Scale to a specified range (e.g., [0, 1])
//! - Custom scalers via the [`Scaler`] trait
//!
//! # Categorical Encoding
//!
//! [`CategoricalEncoder`] maps string categories to numeric codes. Unknown
//! categories are mapped to a default value (default: -1.0).
//!
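//! # Example
//!
//! A minimal usage sketch (marked `ignore` because it assumes an already
//! constructed `FeatureMatrix` named `features` and that this module is
//! publicly reachable at the path shown):
//!
//! ```ignore
//! use gbrt_rs::data::preprocessing::{Scaler, StandardScaler};
//!
//! let mut scaler = StandardScaler::new();
//! let scaled = scaler.fit_transform(&features)?; // features: FeatureMatrix
//! ```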
use super::{FeatureMatrix, DataError, DataResult};
use ndarray::{Array1, Array2};
use serde::{Serialize, Deserialize};
use crate::utils::Statistics;
use std::collections::{HashMap, HashSet};
use thiserror::Error;

/// Preprocessing-specific error types.
#[derive(Error, Debug)]
pub enum PreprocessingError {
    /// Errors during categorical encoding (e.g., parsing failures).
    #[error("Categorical encoding error: {0}")]
    EncodingError(String),

    /// Invalid data format or consistency issues.
    #[error("Invalid data: {0}")]
    InvalidData(String),

    /// Scaler used before fitting.
    #[error("Scaler not fitted")]
    NotFitted,
}

/// Trait for feature scaling operations.
///
/// Scalers learn parameters from training data (via [`Scaler::fit`]) and apply
/// consistent transformations to new data (via [`Scaler::transform`]).
///
/// # Contract
///
/// - `fit()` must be called before `transform()`
/// - Transformations must be deterministic and reproducible
/// - Scaling parameters must be serializable for persistence
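///
/// # Implementing a Custom Scaler
///
/// A minimal sketch of a pass-through implementation (marked `ignore`; it
/// assumes `FeatureMatrix` implements `Clone`, which this module does not
/// itself guarantee):
///
/// ```ignore
/// struct IdentityScaler;
///
/// impl Scaler for IdentityScaler {
///     fn fit(&mut self, _data: &FeatureMatrix) -> DataResult<()> {
///         Ok(()) // nothing to learn
///     }
///
///     fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
///         Ok(data.clone()) // pass data through unchanged
///     }
/// }
/// ```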
pub trait Scaler: Send + Sync {
    /// Learns scaling parameters from the training data.
    ///
    /// # Parameters
    /// - `data`: Training feature matrix
    ///
    /// # Returns
    /// `Ok(())` on successful fitting
    ///
    /// # Errors
    /// - `DataError::PreprocessingError` if fitting fails
    fn fit(&mut self, data: &FeatureMatrix) -> DataResult<()>;

    /// Applies the learned transformation to data.
    ///
    /// # Parameters
    /// - `data`: Feature matrix to transform
    ///
    /// # Returns
    /// Transformed feature matrix
    ///
    /// # Errors
    /// - `DataError::PreprocessingError` if the scaler is not fitted
    /// - `DataError::PreprocessingError` on feature count mismatch
    fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix>;

    /// Fits and transforms in a single operation.
    ///
    /// Equivalent to calling `fit()` then `transform()`.
    ///
    /// # Returns
    /// Fitted and transformed feature matrix
    fn fit_transform(&mut self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
        self.fit(data)?;
        self.transform(data)
    }
}

// ============================================================================
// StandardScaler - standardize features by removing mean and scaling to unit variance
// ============================================================================

/// Standardizes features by removing the mean and scaling to unit variance.
///
/// # Formula
///
/// For each feature column `x`:
///
/// ```text
/// z = (x - μ) / σ
/// ```
///
/// where:
/// - `μ` is the mean of the feature
/// - `σ` is the smoothed standard deviation, `sqrt(var(x) + ε)`
/// - `ε` is a small constant for numerical stability
///
/// # When to Use
///
/// - When features have different scales
/// - For algorithms sensitive to feature magnitude (not tree-based)
/// - When you want zero-centered data
///
/// # Notes
///
/// Trees are scale-invariant, so scaling is optional for gradient boosting.
/// However, it can help with regularization and numerical stability.
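///
/// # Example
///
/// A minimal sketch (marked `ignore`; assumes `FeatureMatrix::new` accepts
/// an `ndarray::Array2<f64>`, as it is used elsewhere in this module):
///
/// ```ignore
/// let data = FeatureMatrix::new(ndarray::array![[1.0, 10.0], [3.0, 30.0]])?;
/// let mut scaler = StandardScaler::new();
/// let scaled = scaler.fit_transform(&data)?;
/// // each column of `scaled` now has approximately zero mean and unit variance
/// ```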
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StandardScaler {
    mean: Option<Array1<f64>>,
    scale: Option<Array1<f64>>,
    epsilon: f64,
}

impl Default for StandardScaler {
    fn default() -> Self {
        Self {
            mean: None,
            scale: None,
            epsilon: 1e-8,
        }
    }
}

impl StandardScaler {
    /// Creates a new, unfitted StandardScaler with the default epsilon.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a new StandardScaler with a custom epsilon.
    ///
    /// # Parameters
    /// - `epsilon`: Small constant added to the variance (before the square
    ///   root) for numerical stability (default: 1e-8). Increase it for
    ///   features with very small variance.
    pub fn with_epsilon(epsilon: f64) -> Self {
        Self {
            mean: None,
            scale: None,
            epsilon,
        }
    }

    /// Returns true if the scaler has been fitted.
    pub fn is_fitted(&self) -> bool {
        self.mean.is_some() && self.scale.is_some()
    }
}

impl Scaler for StandardScaler {
    fn fit(&mut self, data: &FeatureMatrix) -> DataResult<()> {
        let n_features = data.n_features();

        // Per-feature mean
        let mean: Array1<f64> = (0..n_features)
            .map(|i| {
                let feature = data.get_feature(i).unwrap();
                feature.mean().unwrap_or(0.0)
            })
            .collect();

        // Per-feature scale: sqrt(variance + epsilon)
        let scale: Array1<f64> = (0..n_features)
            .map(|i| {
                let feature = data.get_feature(i).unwrap();
                let variance = feature.variance().ok().unwrap_or(0.0);
                (variance + self.epsilon).sqrt()
            })
            .collect();

        self.mean = Some(mean);
        self.scale = Some(scale);

        Ok(())
    }

    fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
        let mean = self.mean.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;
        let scale = self.scale.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;

        if data.n_features() != mean.len() {
            return Err(DataError::PreprocessingError(
                format!("Expected {} features, got {}", mean.len(), data.n_features())
            ));
        }

        let mut transformed_data = data.data().clone();

        // Apply z = (x - mean) / scale column by column
        for i in 0..data.n_features() {
            let mut column = transformed_data.column_mut(i);
            for j in 0..column.len() {
                column[j] = (column[j] - mean[i]) / scale[i];
            }
        }

        FeatureMatrix::with_feature_names(
            transformed_data,
            data.feature_names().to_vec()
        ).map_err(Into::into)
    }
}

// ============================================================================
// MinMaxScaler - scale features to a given range (default [0, 1])
// ============================================================================

/// Scales features to a given range (default [0, 1]).
///
/// # Formula
///
/// For each feature column `x`:
///
/// ```text
/// z = (x - x_min) / (x_max - x_min) * (max_range - min_range) + min_range
/// ```
///
/// where:
/// - `x_min` is the minimum value seen during fitting
/// - `x_max` is the maximum value seen during fitting
/// - `min_range` and `max_range` are the target range bounds
///
/// # When to Use
///
/// - When you need bounded feature ranges
/// - For neural networks (input normalization)
/// - When preserving zero values is important
/// - For visualization purposes
///
/// # Handling Constant Features
///
/// If a feature has zero range (all values identical), the scaler
/// sets the scale factor to 1.0 to avoid division by zero.
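///
/// # Example
///
/// A minimal sketch (marked `ignore`; same `FeatureMatrix::new` assumption
/// as for [`StandardScaler`]):
///
/// ```ignore
/// let data = FeatureMatrix::new(ndarray::array![[0.0], [5.0], [10.0]])?;
/// let mut scaler = MinMaxScaler::new();
/// let scaled = scaler.fit_transform(&data)?;
/// // the single column becomes [0.0, 0.5, 1.0]
/// ```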
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MinMaxScaler {
    min: Option<Array1<f64>>,
    scale: Option<Array1<f64>>,
    feature_range: (f64, f64),
}

impl Default for MinMaxScaler {
    fn default() -> Self {
        Self {
            min: None,
            scale: None,
            feature_range: (0.0, 1.0),
        }
    }
}

impl MinMaxScaler {
    /// Creates a new, unfitted MinMaxScaler with the default [0, 1] range.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a new MinMaxScaler with a custom output range.
    ///
    /// # Parameters
    /// - `min`: Lower bound of output range
    /// - `max`: Upper bound of output range
    ///
    /// # Note
    /// The range is not validated; if `min >= max`, transformed values will
    /// be reversed or constant rather than an error being raised.
    pub fn with_range(min: f64, max: f64) -> Self {
        Self {
            min: None,
            scale: None,
            feature_range: (min, max),
        }
    }

    /// Returns true if the scaler has been fitted.
    pub fn is_fitted(&self) -> bool {
        self.min.is_some() && self.scale.is_some()
    }
}

impl Scaler for MinMaxScaler {
    fn fit(&mut self, data: &FeatureMatrix) -> DataResult<()> {
        // Get min values for each feature using fold_axis
        let data_min = data.data().fold_axis(
            ndarray::Axis(0),
            f64::INFINITY,
            |&min, &x| min.min(x)
        );

        // Get max values for each feature using fold_axis
        let data_max = data.data().fold_axis(
            ndarray::Axis(0),
            f64::NEG_INFINITY,
            |&max, &x| max.max(x)
        );

        // Calculate range (max - min) for each feature
        let data_range = &data_max - &data_min;

        // Handle constant features to avoid division by zero
        let scale: Array1<f64> = data_range.iter()
            .map(|&range| {
                if range == 0.0 {
                    1.0
                } else {
                    (self.feature_range.1 - self.feature_range.0) / range
                }
            })
            .collect();

        self.min = Some(data_min);
        self.scale = Some(scale);

        Ok(())
    }

    fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
        let data_min = self.min.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;
        let scale = self.scale.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;

        if data.n_features() != data_min.len() {
            return Err(DataError::PreprocessingError(
                format!("Expected {} features, got {}", data_min.len(), data.n_features())
            ));
        }

        let mut transformed_data = data.data().clone();

        for i in 0..data.n_features() {
            let mut column = transformed_data.column_mut(i);
            for j in 0..column.len() {
                column[j] = (column[j] - data_min[i]) * scale[i] + self.feature_range.0;
            }
        }

        FeatureMatrix::with_feature_names(
            transformed_data,
            data.feature_names().to_vec()
        ).map_err(Into::into)
    }
}

// ============================================================================
// CategoricalEncoder - encode string categories as numerical values
// ============================================================================

/// Encodes string categorical features into numerical values.
///
/// # Encoding Strategy
///
/// - Each unique string gets an integer code starting from 0
/// - Unknown categories (not seen during fitting) get a default value
/// - Numerical columns (parseable as f64) are passed through unchanged
///
/// # Usage Pattern
///
/// 1. `fit_transform()`: Learn mappings and encode training data
/// 2. `transform()`: Encode new data using existing mappings
///
/// # Unknown Categories
///
/// Categories not present during fitting are mapped to `default_value`
/// (default: -1.0). This prevents errors but may affect model performance.
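///
/// # Example
///
/// A minimal sketch (marked `ignore`; column 0 is categorical, column 1
/// numeric):
///
/// ```ignore
/// let data = vec![
///     vec!["red".to_string(), "1.5".to_string()],
///     vec!["blue".to_string(), "2.5".to_string()],
/// ];
/// let mut encoder = CategoricalEncoder::new();
/// let encoded = encoder.fit_transform(&data, &[0])?;
/// // categories are sorted before coding, so "blue" -> 0.0 and "red" -> 1.0
/// assert_eq!(encoded[0], vec![1.0, 1.5]);
/// ```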
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoricalEncoder {
    /// Mapping: column index (as a string key) -> (category string -> numeric code)
    mappings: HashMap<String, HashMap<String, f64>>,
    /// Value for unknown categories
    default_value: f64,
}

impl Default for CategoricalEncoder {
    fn default() -> Self {
        Self::new()
    }
}

impl CategoricalEncoder {
    /// Creates a new encoder with default value -1.0 for unknown categories.
    pub fn new() -> Self {
        Self {
            mappings: HashMap::new(),
            default_value: -1.0,
        }
    }

    /// Creates a new encoder with a custom default value.
    ///
    /// # Parameters
    /// - `default_value`: Value to use for categories not seen during fitting
    pub fn with_default(default_value: f64) -> Self {
        Self {
            mappings: HashMap::new(),
            default_value,
        }
    }

    /// Fits encoder to data and transforms it in one pass.
    ///
    /// # Parameters
    /// - `data`: Dataset as strings (CSV-like structure)
    /// - `categorical_columns`: Column indices to treat as categorical
    ///
    /// # Returns
    /// Encoded numerical data
    ///
    /// # Errors
    /// - `PreprocessingError::InvalidData` if data is empty or inconsistent
    /// - `PreprocessingError::EncodingError` if numerical parsing fails
    pub fn fit_transform(
        &mut self,
        data: &[Vec<String>],
        categorical_columns: &[usize]
    ) -> Result<Vec<Vec<f64>>, PreprocessingError> {
        if data.is_empty() {
            return Err(PreprocessingError::InvalidData("Empty dataset".to_string()));
        }

        let n_features = data[0].len();
        let n_samples = data.len();
        let mut encoded_data = Vec::with_capacity(n_samples);

        // Validate categorical column indices
        for &col_idx in categorical_columns {
            if col_idx >= n_features {
                return Err(PreprocessingError::InvalidData(
                    format!("Column index {} out of bounds ({} features)", col_idx, n_features)
                ));
            }
        }

        // Initialize mappings for each categorical column
        for &col_idx in categorical_columns {
            self.mappings.insert(col_idx.to_string(), HashMap::new());
        }

        // First pass: collect unique values for each categorical column
        for &col_idx in categorical_columns {
            let mut unique_values = Vec::new();
            for row in data {
                let value = &row[col_idx];
                if !unique_values.contains(value) {
                    unique_values.push(value.clone());
                }
            }

            // Sort for consistent encoding
            unique_values.sort();

            // Create mapping (value -> index)
            let mapping = self.mappings.get_mut(&col_idx.to_string()).unwrap();
            for (idx, value) in unique_values.iter().enumerate() {
                mapping.insert(value.clone(), idx as f64);
            }
        }

        // Second pass: encode data
        for row in data {
            let mut encoded_row = Vec::with_capacity(n_features);

            for (col_idx, value) in row.iter().enumerate() {
                if self.mappings.contains_key(&col_idx.to_string()) {
                    // Categorical column - use encoding
                    let mapping = self.mappings.get(&col_idx.to_string()).unwrap();
                    let encoded_value = mapping.get(value).unwrap_or(&self.default_value);
                    encoded_row.push(*encoded_value);
                } else {
                    // Numerical column - try to parse as f64
                    match value.parse::<f64>() {
                        Ok(num) => encoded_row.push(num),
                        Err(_) => {
                            return Err(PreprocessingError::EncodingError(
                                format!("Failed to parse numerical value '{}' in column {}", value, col_idx)
                            ));
                        }
                    }
                }
            }

            encoded_data.push(encoded_row);
        }

        Ok(encoded_data)
    }

    /// Transforms new data using existing mappings.
    ///
    /// # Parameters
    /// - `data`: New dataset as strings
    ///
    /// # Returns
    /// Encoded numerical data
    ///
    /// # Errors
    /// - `PreprocessingError::InvalidData` if row lengths are inconsistent
    /// - `PreprocessingError::EncodingError` if numerical parsing fails
    ///
    /// # Unknown Categories
    /// Categories not seen during fitting are mapped to `default_value`.
    pub fn transform(&self, data: &[Vec<String>]) -> Result<Vec<Vec<f64>>, PreprocessingError> {
        if data.is_empty() {
            return Ok(Vec::new());
        }

        let n_features = data[0].len();
        let mut encoded_data = Vec::with_capacity(data.len());

        for row in data {
            if row.len() != n_features {
                return Err(PreprocessingError::InvalidData(
                    format!("Inconsistent row length: expected {}, got {}", n_features, row.len())
                ));
            }

            let mut encoded_row = Vec::with_capacity(n_features);

            for (col_idx, value) in row.iter().enumerate() {
                if let Some(mapping) = self.mappings.get(&col_idx.to_string()) {
                    // Categorical column - use existing mapping
                    let encoded_value = mapping.get(value).unwrap_or(&self.default_value);
                    encoded_row.push(*encoded_value);
                } else {
                    // Numerical column - try to parse as f64
                    match value.parse::<f64>() {
                        Ok(num) => encoded_row.push(num),
                        Err(_) => {
                            return Err(PreprocessingError::EncodingError(
                                format!("Failed to parse numerical value '{}' in column {}", value, col_idx)
                            ));
                        }
                    }
                }
            }

            encoded_data.push(encoded_row);
        }

        Ok(encoded_data)
    }

    /// Gets the mapping for a specific column.
    ///
    /// # Parameters
    /// - `column`: Column index
    ///
    /// # Returns
    /// `Some(&HashMap)` if column was fitted as categorical, `None` otherwise
    pub fn get_mapping(&self, column: usize) -> Option<&HashMap<String, f64>> {
        self.mappings.get(&column.to_string())
    }

    /// Gets the number of unique categories for a column.
    ///
    /// # Parameters
    /// - `column`: Column index
    ///
    /// # Returns
    /// `Some(count)` if column was fitted as categorical, `None` otherwise
    pub fn n_categories(&self, column: usize) -> Option<usize> {
        self.get_mapping(column).map(|m| m.len())
    }
}

// ============================================================================
// Helper Functions
// ============================================================================

/// Automatically detects categorical columns based on data characteristics.
///
/// Uses two heuristics:
/// 1. **Data type**: Columns containing non-numeric values are categorical
/// 2. **Cardinality**: Numeric columns with a low uniqueness ratio
///    (below `threshold`) are categorical
///
/// # Parameters
/// - `data`: Dataset as strings (CSV-like structure)
/// - `threshold`: Ratio threshold (`unique_values / total_samples`);
///   0.1 is a reasonable starting point
///
/// # Returns
/// Vector of column indices identified as categorical
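///
/// # Example
///
/// A minimal sketch (marked `ignore`):
///
/// ```ignore
/// let data = vec![
///     vec!["red".to_string(), "1.5".to_string()],
///     vec!["blue".to_string(), "2.5".to_string()],
/// ];
/// // column 0 is non-numeric, so it is detected as categorical;
/// // column 1 is numeric with a uniqueness ratio of 1.0, so it is not
/// assert_eq!(detect_categorical_columns(&data, 0.1), vec![0]);
/// ```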
pub fn detect_categorical_columns(data: &[Vec<String>], threshold: f64) -> Vec<usize> {
    if data.is_empty() {
        return Vec::new();
    }

    let n_samples = data.len();
    let n_features = data[0].len();
    let mut categorical_columns = Vec::new();

    for col_idx in 0..n_features {
        let mut unique_values = HashSet::new();
        let mut is_numeric = true;

        for row in data {
            let value = &row[col_idx];

            // Check if value can be parsed as f64
            if value.parse::<f64>().is_err() {
                is_numeric = false;
            }

            unique_values.insert(value.clone());
        }

        // Not numeric, OR numeric with few unique values (likely categorical codes)
        let uniqueness_ratio = unique_values.len() as f64 / n_samples as f64;
        if !is_numeric || uniqueness_ratio < threshold {
            categorical_columns.push(col_idx);
        }
    }

    categorical_columns
}

// ============================================================================
// DataPreprocessor - combine multiple preprocessing steps
// ============================================================================

/// High-level preprocessor that combines categorical encoding and scaling.
///
/// This convenience struct orchestrates multiple preprocessing steps:
/// 1. Categorical encoding (if an encoder is provided)
/// 2. Numerical scaling (if a scaler is provided)
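///
/// # Example
///
/// A minimal sketch (marked `ignore`; `rows` is hypothetical string data,
/// like the input shown for [`CategoricalEncoder`]):
///
/// ```ignore
/// let mut prep = DataPreprocessor::new()
///     .with_scaler(Box::new(StandardScaler::new()));
/// let features = prep.fit_transform(&rows, &[0])?; // column 0 is categorical
/// ```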
pub struct DataPreprocessor {
    categorical_encoder: Option<CategoricalEncoder>,
    scaler: Option<Box<dyn Scaler>>,
}

impl DataPreprocessor {
    /// Creates a new preprocessor with no steps configured.
    pub fn new() -> Self {
        Self {
            categorical_encoder: None,
            scaler: None,
        }
    }

    /// Adds a categorical encoding step.
    ///
    /// # Parameters
    /// - `encoder`: Categorical encoder instance
    ///
    /// # Returns
    /// Self for method chaining
    pub fn with_categorical_encoder(mut self, encoder: CategoricalEncoder) -> Self {
        self.categorical_encoder = Some(encoder);
        self
    }

    /// Adds a scaling step.
    ///
    /// # Parameters
    /// - `scaler`: Boxed scaler trait object (e.g., `Box::new(StandardScaler::new())`)
    ///
    /// # Returns
    /// Self for method chaining
    pub fn with_scaler(mut self, scaler: Box<dyn Scaler>) -> Self {
        self.scaler = Some(scaler);
        self
    }

    /// Fits the encoder (if any) and transforms the data, then applies the
    /// scaler (if any).
    ///
    /// # Parameters
    /// - `data`: Raw string data (CSV-like)
    /// - `categorical_columns`: Column indices to treat as categorical
    ///
    /// # Returns
    /// Preprocessed feature matrix
    ///
    /// # Errors
    /// - `PreprocessingError::InvalidData` if encoding fails
    /// - `PreprocessingError::EncodingError` if numerical parsing fails
    /// - `PreprocessingError::InvalidData` if scaler fitting or transforming fails
    pub fn fit_transform(
        &mut self,
        data: &[Vec<String>],
        categorical_columns: &[usize],
    ) -> Result<FeatureMatrix, PreprocessingError> {
        // Encode categorical data, reusing a configured encoder if one was
        // supplied via `with_categorical_encoder`
        let encoder = self
            .categorical_encoder
            .get_or_insert_with(CategoricalEncoder::new);
        let encoded_data = encoder.fit_transform(data, categorical_columns)?;

        // Convert to FeatureMatrix
        let n_samples = encoded_data.len();
        let n_features = encoded_data[0].len();

        let flat_data: Vec<f64> = encoded_data.into_iter().flatten().collect();
        let array = Array2::from_shape_vec((n_samples, n_features), flat_data)
            .map_err(|e| PreprocessingError::InvalidData(e.to_string()))?;

        let feature_matrix = FeatureMatrix::new(array)
            .map_err(|e| PreprocessingError::InvalidData(e.to_string()))?;

        // Apply scaler if specified
        if let Some(scaler) = &mut self.scaler {
            scaler.fit(&feature_matrix)
                .map_err(|e| PreprocessingError::InvalidData(e.to_string()))?;
            return scaler.transform(&feature_matrix)
                .map_err(|e| PreprocessingError::InvalidData(e.to_string()));
        }

        Ok(feature_matrix)
    }
}