gbrt_rs/data/preprocessing.rs
//! Data Preprocessing for Machine Learning
//!
//! This module provides preprocessing utilities for preparing raw data
//! for gradient boosting models:
//!
//! - **Feature Scaling**: Standardization and min-max normalization
//! - **Categorical Encoding**: Convert string categories to numerical values
//! - **Automatic Detection**: Identify categorical columns heuristically
//! - **Preprocessor Pipeline**: Combine multiple preprocessing steps
//!
//! # Scaling Strategies
//!
//! - [`StandardScaler`]: Zero mean, unit variance (z-score normalization)
//! - [`MinMaxScaler`]: Scale to specified range (e.g., [0, 1])
//! - Custom scalers via the [`Scaler`] trait
//!
//! # Categorical Encoding
//!
//! [`CategoricalEncoder`] maps string categories to numeric codes. Unknown
//! categories are mapped to a default value (default: -1.0).
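//!
//! # Example
//!
//! A minimal end-to-end sketch (`ignore`d as a doctest because it assumes
//! these items are importable from the crate root):
//!
//! ```ignore
//! let raw = vec![
//!     vec!["red".to_string(), "1.0".to_string()],
//!     vec!["blue".to_string(), "2.0".to_string()],
//! ];
//! // Column 0 is non-numeric, so it is detected as categorical.
//! let categorical = detect_categorical_columns(&raw, 0.1);
//! let mut preprocessor = DataPreprocessor::new()
//!     .with_scaler(Box::new(StandardScaler::new()));
//! let features = preprocessor.fit_transform(&raw, &categorical).unwrap();
//! ```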
use super::{FeatureMatrix, DataError, DataResult};
use ndarray::{Array1, Array2};
use serde::{Serialize, Deserialize};
use crate::utils::Statistics;
use std::collections::{HashMap, HashSet};
use thiserror::Error;

/// Preprocessing-specific error types.
#[derive(Error, Debug)]
pub enum PreprocessingError {
    /// Errors during categorical encoding (e.g., parsing failures).
    #[error("Categorical encoding error: {0}")]
    EncodingError(String),

    /// Invalid data format or consistency issues.
    #[error("Invalid data: {0}")]
    InvalidData(String),

    /// Scaler used before fitting.
    #[error("Scaler not fitted")]
    NotFitted,
}

/// Trait for feature scaling operations.
///
/// Scalers learn parameters from training data (via [`Scaler::fit`]) and apply
/// consistent transformations to new data (via [`Scaler::transform`]).
///
/// # Contract
///
/// - `fit()` must be called before `transform()`
/// - Transformations must be deterministic and reproducible
/// - Scaling parameters must be serializable for persistence
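///
/// # Implementing a Custom Scaler
///
/// A minimal no-op sketch (`ignore`d as a doctest; it assumes
/// `FeatureMatrix: Clone`):
///
/// ```ignore
/// struct IdentityScaler {
///     fitted: bool,
/// }
///
/// impl Scaler for IdentityScaler {
///     fn fit(&mut self, _data: &FeatureMatrix) -> DataResult<()> {
///         // Nothing to learn; just record that fit() was called.
///         self.fitted = true;
///         Ok(())
///     }
///
///     fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
///         if !self.fitted {
///             return Err(DataError::PreprocessingError("Scaler not fitted".to_string()));
///         }
///         Ok(data.clone())
///     }
/// }
/// ```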
pub trait Scaler: Send + Sync {
    /// Learns scaling parameters from the training data.
    ///
    /// # Parameters
    /// - `data`: Training feature matrix
    ///
    /// # Returns
    /// `Ok(())` on successful fitting
    ///
    /// # Errors
    /// - `DataError::PreprocessingError` if fitting fails
    fn fit(&mut self, data: &FeatureMatrix) -> DataResult<()>;

    /// Applies learned transformation to data.
    ///
    /// # Parameters
    /// - `data`: Feature matrix to transform
    ///
    /// # Returns
    /// Transformed feature matrix
    ///
    /// # Errors
    /// - `DataError::PreprocessingError` if scaler not fitted
    /// - `DataError::PreprocessingError` if feature count mismatch
    fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix>;

    /// Fits and transforms in a single operation.
    ///
    /// Equivalent to calling `fit()` then `transform()`.
    ///
    /// # Returns
    /// Fitted and transformed feature matrix
    fn fit_transform(&mut self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
        self.fit(data)?;
        self.transform(data)
    }
}

// ============================================================================
// StandardScaler - standardize features by removing mean and scaling to unit variance
// ============================================================================

/// Standardizes features by removing the mean and scaling to unit variance.
///
/// # Formula
///
/// For each feature column `x`:
///
/// ```text
/// z = (x - μ) / σ
/// ```
///
/// where:
/// - `μ` is the mean of the feature
/// - `σ` is the standard deviation, computed as `sqrt(variance + ε)`
/// - `ε` is a small constant for numerical stability
///
/// # When to Use
///
/// - When features have different scales
/// - For algorithms sensitive to feature magnitude (not tree-based)
/// - When you want zero-centered data
///
/// # Notes
///
/// Trees are scale-invariant, so scaling is optional for gradient boosting.
/// However, it can help with regularization and numerical stability.
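///
/// # Example
///
/// A minimal sketch (`ignore`d as a doctest; it assumes `FeatureMatrix` and
/// the `Scaler` trait are importable from this crate):
///
/// ```ignore
/// use ndarray::array;
///
/// let x = FeatureMatrix::new(array![[1.0], [2.0], [3.0]]).unwrap();
/// let mut scaler = StandardScaler::new();
/// let z = scaler.fit_transform(&x).unwrap();
/// // The column now has mean ~0 and standard deviation ~1
/// // (up to the epsilon term added for stability).
/// ```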
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StandardScaler {
    mean: Option<Array1<f64>>,
    scale: Option<Array1<f64>>,
    epsilon: f64,
}

impl Default for StandardScaler {
    fn default() -> Self {
        Self {
            mean: None,
            scale: None,
            epsilon: 1e-8,
        }
    }
}

impl StandardScaler {
    /// Creates a new unfitted StandardScaler with default epsilon.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a new StandardScaler with custom epsilon.
    ///
    /// # Parameters
    /// - `epsilon`: Small constant added to the variance before taking the
    ///   square root, for numerical stability. The default constructor uses
    ///   1e-8; increase it for features with very small variance.
    pub fn with_epsilon(epsilon: f64) -> Self {
        Self {
            mean: None,
            scale: None,
            epsilon,
        }
    }

    /// Returns true if the scaler has been fitted.
    pub fn is_fitted(&self) -> bool {
        self.mean.is_some() && self.scale.is_some()
    }
}

impl Scaler for StandardScaler {
    fn fit(&mut self, data: &FeatureMatrix) -> DataResult<()> {
        let n_features = data.n_features();

        // Per-feature means; unwrap is safe because i < n_features
        let mean: Array1<f64> = (0..n_features)
            .map(|i| {
                let feature = data.get_feature(i).unwrap();
                feature.mean().unwrap_or(0.0)
            })
            .collect();

        // Per-feature scale factors: sqrt(variance + epsilon)
        let scale: Array1<f64> = (0..n_features)
            .map(|i| {
                let feature = data.get_feature(i).unwrap();
                let variance = feature.variance().ok().unwrap_or(0.0);
                (variance + self.epsilon).sqrt()
            })
            .collect();

        self.mean = Some(mean);
        self.scale = Some(scale);

        Ok(())
    }

    fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
        let mean = self.mean.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;
        let scale = self.scale.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;

        if data.n_features() != mean.len() {
            return Err(DataError::PreprocessingError(
                format!("Expected {} features, got {}", mean.len(), data.n_features())
            ));
        }

        // Apply z = (x - mean) / scale column by column
        let mut transformed_data = data.data().clone();

        for i in 0..data.n_features() {
            let mut column = transformed_data.column_mut(i);
            for j in 0..column.len() {
                column[j] = (column[j] - mean[i]) / scale[i];
            }
        }

        FeatureMatrix::with_feature_names(
            transformed_data,
            data.feature_names().to_vec()
        ).map_err(Into::into)
    }
}

// ============================================================================
// MinMaxScaler - scale features to a given range (default [0, 1])
// ============================================================================

/// Scales features to a given range (default [0, 1]).
///
/// # Formula
///
/// For each feature column `x`:
///
/// ```text
/// z = (x - x_min) / (x_max - x_min) * (max_range - min_range) + min_range
/// ```
///
/// where:
/// - `x_min` is the minimum value seen during fitting
/// - `x_max` is the maximum value seen during fitting
/// - `max_range` and `min_range` are the target range bounds
///
/// # When to Use
///
/// - When you need bounded feature ranges
/// - For neural networks (input normalization)
/// - When preserving zero values is important
/// - For visualization purposes
///
/// # Handling Constant Features
///
/// If a feature has zero range (all values identical), the scaler
/// sets the scale factor to 1.0 to avoid division by zero.
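///
/// # Example
///
/// A minimal sketch (`ignore`d as a doctest; it assumes `FeatureMatrix` and
/// the `Scaler` trait are importable from this crate):
///
/// ```ignore
/// use ndarray::array;
///
/// let x = FeatureMatrix::new(array![[0.0], [5.0], [10.0]]).unwrap();
/// let mut scaler = MinMaxScaler::with_range(-1.0, 1.0);
/// let scaled = scaler.fit_transform(&x).unwrap();
/// // 0.0 -> -1.0, 5.0 -> 0.0, 10.0 -> 1.0
/// ```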
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MinMaxScaler {
    min: Option<Array1<f64>>,
    scale: Option<Array1<f64>>,
    feature_range: (f64, f64),
}

impl Default for MinMaxScaler {
    fn default() -> Self {
        Self {
            min: None,
            scale: None,
            feature_range: (0.0, 1.0),
        }
    }
}

263
264impl MinMaxScaler {
265 /// Creates a new unsc fitted MinMaxScaler with default [0, 1] range.
266 pub fn new() -> Self {
267 Self::default()
268 }
269
270 /// Creates a new MinMaxScaler with custom output range.
271 ///
272 /// # Parameters
273 /// - `min`: Lower bound of output range
274 /// - `max`: Upper bound of output range
275 ///
276 /// # Panics
277 /// Will cause transformation errors if `min >= max`.
278 pub fn with_range(min: f64, max: f64) -> Self {
279 Self {
280 min: None,
281 scale: None,
282 feature_range: (min, max),
283 }
284 }
285
286 /// Returns true if the scaler has been fitted.
287 pub fn is_fitted(&self) -> bool {
288 self.min.is_some() && self.scale.is_some()
289 }
290}

impl Scaler for MinMaxScaler {
    fn fit(&mut self, data: &FeatureMatrix) -> DataResult<()> {
        // Get min values for each feature using fold_axis
        let data_min = data.data().fold_axis(
            ndarray::Axis(0),
            f64::INFINITY,
            |&min, &x| min.min(x)
        );

        // Get max values for each feature using fold_axis
        let data_max = data.data().fold_axis(
            ndarray::Axis(0),
            f64::NEG_INFINITY,
            |&max, &x| max.max(x)
        );

        // Calculate range (max - min) for each feature
        let data_range = &data_max - &data_min;

        // Handle constant features to avoid division by zero
        let scale: Array1<f64> = data_range.iter()
            .map(|&range| {
                if range == 0.0 {
                    1.0
                } else {
                    (self.feature_range.1 - self.feature_range.0) / range
                }
            })
            .collect();

        self.min = Some(data_min);
        self.scale = Some(scale);

        Ok(())
    }

    fn transform(&self, data: &FeatureMatrix) -> DataResult<FeatureMatrix> {
        let data_min = self.min.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;
        let scale = self.scale.as_ref()
            .ok_or_else(|| DataError::PreprocessingError("Scaler not fitted".to_string()))?;

        if data.n_features() != data_min.len() {
            return Err(DataError::PreprocessingError(
                format!("Expected {} features, got {}", data_min.len(), data.n_features())
            ));
        }

        let mut transformed_data = data.data().clone();

        for i in 0..data.n_features() {
            let mut column = transformed_data.column_mut(i);
            for j in 0..column.len() {
                column[j] = (column[j] - data_min[i]) * scale[i] + self.feature_range.0;
            }
        }

        FeatureMatrix::with_feature_names(
            transformed_data,
            data.feature_names().to_vec()
        ).map_err(Into::into)
    }
}

// ============================================================================
// CategoricalEncoder - encode string categories as numerical values
// ============================================================================

/// Encodes string categorical features into numerical values.
///
/// # Encoding Strategy
///
/// - Each unique string gets an integer code starting from 0, assigned in
///   sorted order so the encoding is deterministic
/// - Unknown categories (not seen during fitting) get a default value
/// - Numerical columns (parseable as f64) are passed through unchanged
///
/// # Usage Pattern
///
/// 1. `fit_transform()`: Learn mappings and encode training data
/// 2. `transform()`: Encode new data using existing mappings
///
/// # Unknown Categories
///
/// Categories not present during fitting are mapped to `default_value`
/// (default: -1.0). This prevents errors but may affect model performance.
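///
/// # Example
///
/// A minimal sketch (`ignore`d as a doctest; it assumes `CategoricalEncoder`
/// is importable from this crate):
///
/// ```ignore
/// let data = vec![
///     vec!["red".to_string(), "1.5".to_string()],
///     vec!["blue".to_string(), "2.5".to_string()],
/// ];
/// let mut encoder = CategoricalEncoder::new();
/// let encoded = encoder.fit_transform(&data, &[0]).unwrap();
/// // Sorted order: "blue" -> 0.0, "red" -> 1.0; column 1 is parsed as f64
/// assert_eq!(encoded[0], vec![1.0, 1.5]);
///
/// // Unknown categories map to the default value (-1.0)
/// let new = vec![vec!["green".to_string(), "9.0".to_string()]];
/// assert_eq!(encoder.transform(&new).unwrap()[0], vec![-1.0, 9.0]);
/// ```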
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoricalEncoder {
    /// Mapping: column index (stored as a `String` key) -> (category_string -> numeric_code)
    mappings: HashMap<String, HashMap<String, f64>>,
    /// Value for unknown categories
    default_value: f64,
}

impl Default for CategoricalEncoder {
    fn default() -> Self {
        Self::new()
    }
}

impl CategoricalEncoder {
    /// Creates a new encoder with default value -1.0 for unknown categories.
    pub fn new() -> Self {
        Self {
            mappings: HashMap::new(),
            default_value: -1.0, // Use -1.0 for unknown categories
        }
    }

    /// Creates a new encoder with custom default value.
    ///
    /// # Parameters
    /// - `default_value`: Value to use for categories not seen during fitting
    pub fn with_default(default_value: f64) -> Self {
        Self {
            mappings: HashMap::new(),
            default_value,
        }
    }

    /// Fits encoder to data and transforms it in one pass.
    ///
    /// # Parameters
    /// - `data`: Dataset as strings (CSV-like structure)
    /// - `categorical_columns`: Column indices to treat as categorical
    ///
    /// # Returns
    /// Encoded numerical data
    ///
    /// # Errors
    /// - `PreprocessingError::InvalidData` if data is empty or inconsistent
    /// - `PreprocessingError::EncodingError` if numerical parsing fails
    pub fn fit_transform(
        &mut self,
        data: &[Vec<String>],
        categorical_columns: &[usize]
    ) -> Result<Vec<Vec<f64>>, PreprocessingError> {
        if data.is_empty() {
            return Err(PreprocessingError::InvalidData("Empty dataset".to_string()));
        }

        let n_features = data[0].len();
        let n_samples = data.len();
        let mut encoded_data = Vec::with_capacity(n_samples);

        // Validate row lengths up front so the indexing below cannot panic
        for row in data {
            if row.len() != n_features {
                return Err(PreprocessingError::InvalidData(
                    format!("Inconsistent row length: expected {}, got {}", n_features, row.len())
                ));
            }
        }

        // Validate categorical column indices
        for &col_idx in categorical_columns {
            if col_idx >= n_features {
                return Err(PreprocessingError::InvalidData(
                    format!("Column index {} out of bounds (max: {})", col_idx, n_features - 1)
                ));
            }
        }

        // Initialize mappings for each categorical column
        for &col_idx in categorical_columns {
            self.mappings.insert(col_idx.to_string(), HashMap::new());
        }

        // First pass: collect unique values for each categorical column
        for &col_idx in categorical_columns {
            let mut unique_values = Vec::new();
            for row in data {
                let value = &row[col_idx];
                if !unique_values.contains(value) {
                    unique_values.push(value.clone());
                }
            }

            // Sort for consistent encoding
            unique_values.sort();

            // Create mapping (value -> index)
            let mapping = self.mappings.get_mut(&col_idx.to_string()).unwrap();
            for (idx, value) in unique_values.iter().enumerate() {
                mapping.insert(value.clone(), idx as f64);
            }
        }

        // Second pass: encode data
        for row in data {
            let mut encoded_row = Vec::with_capacity(n_features);

            for (col_idx, value) in row.iter().enumerate() {
                if self.mappings.contains_key(&col_idx.to_string()) {
                    // Categorical column - use encoding
                    let mapping = self.mappings.get(&col_idx.to_string()).unwrap();
                    let encoded_value = mapping.get(value).unwrap_or(&self.default_value);
                    encoded_row.push(*encoded_value);
                } else {
                    // Numerical column - try to parse as f64
                    match value.parse::<f64>() {
                        Ok(num) => encoded_row.push(num),
                        Err(_) => {
                            return Err(PreprocessingError::EncodingError(
                                format!("Failed to parse numerical value '{}' in column {}", value, col_idx)
                            ));
                        }
                    }
                }
            }

            encoded_data.push(encoded_row);
        }

        Ok(encoded_data)
    }

    /// Transforms new data using existing mappings.
    ///
    /// # Parameters
    /// - `data`: New dataset as strings
    ///
    /// # Returns
    /// Encoded numerical data
    ///
    /// # Errors
    /// - `PreprocessingError::InvalidData` if row lengths are inconsistent
    /// - `PreprocessingError::EncodingError` if numerical parsing fails
    ///
    /// # Unknown Categories
    /// Categories not seen during fitting are mapped to `default_value`.
    pub fn transform(&self, data: &[Vec<String>]) -> Result<Vec<Vec<f64>>, PreprocessingError> {
        if data.is_empty() {
            return Ok(Vec::new());
        }

        let n_features = data[0].len();
        let mut encoded_data = Vec::with_capacity(data.len());

        for row in data {
            if row.len() != n_features {
                return Err(PreprocessingError::InvalidData(
                    format!("Inconsistent row length: expected {}, got {}", n_features, row.len())
                ));
            }

            let mut encoded_row = Vec::with_capacity(n_features);

            for (col_idx, value) in row.iter().enumerate() {
                if let Some(mapping) = self.mappings.get(&col_idx.to_string()) {
                    // Categorical column - use existing mapping
                    let encoded_value = mapping.get(value).unwrap_or(&self.default_value);
                    encoded_row.push(*encoded_value);
                } else {
                    // Numerical column - try to parse as f64
                    match value.parse::<f64>() {
                        Ok(num) => encoded_row.push(num),
                        Err(_) => {
                            return Err(PreprocessingError::EncodingError(
                                format!("Failed to parse numerical value '{}' in column {}", value, col_idx)
                            ));
                        }
                    }
                }
            }

            encoded_data.push(encoded_row);
        }

        Ok(encoded_data)
    }

    /// Gets the mapping for a specific column.
    ///
    /// # Parameters
    /// - `column`: Column index
    ///
    /// # Returns
    /// `Some(&HashMap)` if column was fitted as categorical, `None` otherwise
    pub fn get_mapping(&self, column: usize) -> Option<&HashMap<String, f64>> {
        self.mappings.get(&column.to_string())
    }

    /// Gets the number of unique categories for a column.
    ///
    /// # Parameters
    /// - `column`: Column index
    ///
    /// # Returns
    /// `Some(count)` if column was fitted as categorical, `None` otherwise
    pub fn n_categories(&self, column: usize) -> Option<usize> {
        self.get_mapping(column).map(|m| m.len())
    }
}

// ============================================================================
// Helper Functions
// ============================================================================

/// Automatically detects categorical columns based on data characteristics.
///
/// Uses two heuristics:
/// 1. **Data type**: Columns containing non-numeric values are categorical
/// 2. **Cardinality**: Numeric columns with a low uniqueness ratio (below
///    `threshold`) are treated as categorical
///
/// # Parameters
/// - `data`: Dataset as strings (CSV-like structure)
/// - `threshold`: Uniqueness-ratio cutoff (unique_values / total_samples);
///   0.1 is a reasonable starting point
///
/// # Returns
/// Vector of column indices identified as categorical
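///
/// # Example
///
/// A minimal sketch (`ignore`d as a doctest; it assumes this function is
/// importable from this crate):
///
/// ```ignore
/// let data = vec![
///     vec!["red".to_string(), "1.23".to_string()],
///     vec!["blue".to_string(), "4.56".to_string()],
/// ];
/// // Column 0 is non-numeric, so it is flagged; column 1 parses as f64
/// // and has a uniqueness ratio of 1.0, above the threshold.
/// assert_eq!(detect_categorical_columns(&data, 0.1), vec![0]);
/// ```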
pub fn detect_categorical_columns(data: &[Vec<String>], threshold: f64) -> Vec<usize> {
    if data.is_empty() {
        return Vec::new();
    }

    let n_samples = data.len();
    let n_features = data[0].len();
    let mut categorical_columns = Vec::new();

    for col_idx in 0..n_features {
        let mut unique_values = HashSet::new();
        let mut is_numeric = true;

        for row in data {
            let value = &row[col_idx];

            // Check if value can be parsed as f64
            if value.parse::<f64>().is_err() {
                is_numeric = false;
            }

            unique_values.insert(value.clone());
        }

        // If not numeric OR if numeric but has few unique values (could be categorical codes)
        let uniqueness_ratio = unique_values.len() as f64 / n_samples as f64;
        if !is_numeric || uniqueness_ratio < threshold {
            categorical_columns.push(col_idx);
        }
    }

    categorical_columns
}

// ============================================================================
// DataPreprocessor - Combine multiple preprocessing steps
// ============================================================================

/// High-level preprocessor that combines categorical encoding and scaling.
///
/// This convenience struct orchestrates multiple preprocessing steps:
/// 1. Categorical encoding (if encoder provided)
/// 2. Numerical scaling (if scaler provided)
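///
/// # Example
///
/// A minimal sketch (`ignore`d as a doctest; it assumes these types are
/// importable from this crate):
///
/// ```ignore
/// let raw = vec![
///     vec!["cat".to_string(), "1.0".to_string()],
///     vec!["dog".to_string(), "2.0".to_string()],
/// ];
/// let mut preprocessor = DataPreprocessor::new()
///     .with_categorical_encoder(CategoricalEncoder::new())
///     .with_scaler(Box::new(StandardScaler::new()));
/// let features = preprocessor.fit_transform(&raw, &[0]).unwrap();
/// ```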
pub struct DataPreprocessor {
    categorical_encoder: Option<CategoricalEncoder>,
    scaler: Option<Box<dyn Scaler>>,
}

impl DataPreprocessor {
    /// Creates a new preprocessor with no steps configured.
    pub fn new() -> Self {
        Self {
            categorical_encoder: None,
            scaler: None,
        }
    }

    /// Adds categorical encoding step.
    ///
    /// # Parameters
    /// - `encoder`: Categorical encoder instance
    ///
    /// # Returns
    /// Self for method chaining
    pub fn with_categorical_encoder(mut self, encoder: CategoricalEncoder) -> Self {
        self.categorical_encoder = Some(encoder);
        self
    }

    /// Adds scaling step.
    ///
    /// # Parameters
    /// - `scaler`: Boxed scaler trait object (e.g., `Box::new(StandardScaler::new())`)
    ///
    /// # Returns
    /// Self for method chaining
    pub fn with_scaler(mut self, scaler: Box<dyn Scaler>) -> Self {
        self.scaler = Some(scaler);
        self
    }

    /// Fits the encoder and transforms data, then fits and applies the scaler (if any).
    ///
    /// # Parameters
    /// - `data`: Raw string data (CSV-like)
    /// - `categorical_columns`: Column indices to treat as categorical
    ///
    /// # Returns
    /// Preprocessed feature matrix
    ///
    /// # Errors
    /// - `PreprocessingError::InvalidData` if encoding, matrix construction,
    ///   or scaling fails
    /// - `PreprocessingError::EncodingError` if numerical parsing fails
    pub fn fit_transform(
        &mut self,
        data: &[Vec<String>],
        categorical_columns: &[usize],
    ) -> Result<FeatureMatrix, PreprocessingError> {
        // Encode categorical data, reusing a pre-configured encoder if one was set
        let mut encoder = self.categorical_encoder.take().unwrap_or_default();
        let encoded_data = encoder.fit_transform(data, categorical_columns)?;
        self.categorical_encoder = Some(encoder);

        // Convert to FeatureMatrix
        let n_samples = encoded_data.len();
        let n_features = encoded_data[0].len();

        let flat_data: Vec<f64> = encoded_data.into_iter().flatten().collect();
        let array = Array2::from_shape_vec((n_samples, n_features), flat_data)
            .map_err(|e| PreprocessingError::InvalidData(e.to_string()))?;

        let feature_matrix = FeatureMatrix::new(array)
            .map_err(|e| PreprocessingError::InvalidData(e.to_string()))?;

        // Apply scaler if specified (scaler errors are surfaced as InvalidData)
        if let Some(scaler) = &mut self.scaler {
            scaler.fit(&feature_matrix)
                .map_err(|e| PreprocessingError::InvalidData(e.to_string()))?;
            return scaler.transform(&feature_matrix)
                .map_err(|e| PreprocessingError::InvalidData(e.to_string()));
        }

        Ok(feature_matrix)
    }
}