// ferrolearn_preprocess/lib.rs

//! # ferrolearn-preprocess
//!
//! Data preprocessing transformers for the ferrolearn machine learning framework.
//!
//! This crate provides standard scalers, encoders, imputers, and feature
//! selection utilities that follow the ferrolearn `Fit`/`Transform` trait
//! pattern.
//!
//! ## Scalers
//!
//! All scalers are generic over `F: Float + Send + Sync + 'static` and implement
//! [`Fit<Array2<F>, ()>`](ferrolearn_core::Fit) (returning a `Fitted*` type) and
//! [`FitTransform<Array2<F>>`](ferrolearn_core::FitTransform). The fitted types
//! implement [`Transform<Array2<F>>`](ferrolearn_core::Transform).
//!
//! - [`StandardScaler`] — zero-mean, unit-variance scaling
//! - [`MinMaxScaler`] — scale features to a given range (default `[0, 1]`)
//! - [`RobustScaler`] — median / IQR-based scaling, robust to outliers
//! - [`MaxAbsScaler`] — scale by maximum absolute value so values are in `[-1, 1]`
//! - [`normalizer::Normalizer`] — normalize each sample (row) to unit norm
//! - [`power_transformer::PowerTransformer`] — Yeo-Johnson power transform
//!
//! ## Encoders
//!
//! - [`OneHotEncoder`] — encode `Array2<usize>` categorical columns as binary columns
//! - [`LabelEncoder`] — map `Array1<String>` labels to integer indices
//! - [`ordinal_encoder::OrdinalEncoder`] — map string categories to integers in
//!   order of first appearance
//!
//! ## Imputers
//!
//! - [`imputer::SimpleImputer`] — fill missing (NaN) values per feature column
//!   using Mean, Median, MostFrequent, or Constant strategy.
//!
//! ## Feature Selection
//!
//! - [`feature_selection::VarianceThreshold`] — remove features with variance
//!   below a configurable threshold.
//! - [`feature_selection::SelectKBest`] — keep the K features with the highest
//!   ANOVA F-scores against class labels.
//! - [`feature_selection::SelectFromModel`] — keep features whose importance
//!   weight (from a pre-fitted model) meets a configurable threshold.
//!
//! ## Feature Engineering
//!
//! - [`polynomial_features::PolynomialFeatures`] — generate polynomial and interaction features
//! - [`binarizer::Binarizer`] — threshold features to binary values
//! - [`function_transformer::FunctionTransformer`] — apply a user-provided function element-wise
//!
//! ## Pipeline Integration
//!
//! `StandardScaler<f64>`, `MinMaxScaler<f64>`, `RobustScaler<f64>`,
//! `MaxAbsScaler<f64>`, `Normalizer<f64>`, `PowerTransformer<f64>`,
//! `PolynomialFeatures<f64>`, `SimpleImputer<f64>`, `VarianceThreshold<f64>`,
//! `SelectKBest<f64>`, and `SelectFromModel<f64>` each implement
//! [`PipelineTransformer`](ferrolearn_core::pipeline::PipelineTransformer)
//! so they can be used as steps inside a
//! [`Pipeline`](ferrolearn_core::pipeline::Pipeline).
//!
//! # Examples
//!
//! ```
//! use ferrolearn_preprocess::StandardScaler;
//! use ferrolearn_core::traits::FitTransform;
//! use ndarray::array;
//!
//! let x = array![[1.0_f64, 10.0], [2.0, 20.0], [3.0, 30.0]];
//! let scaled = StandardScaler::<f64>::new().fit_transform(&x).unwrap();
//! // scaled columns now have mean ≈ 0 and std ≈ 1
//! ```

72pub mod binarizer;
73pub mod binary_encoder;
74pub mod column_transformer;
75pub mod feature_selection;
76pub mod function_transformer;
77pub mod imputer;
78pub mod iterative_imputer;
79pub mod kbins_discretizer;
80pub mod knn_imputer;
81pub mod label_encoder;
82pub mod max_abs_scaler;
83pub mod min_max_scaler;
84pub mod normalizer;
85pub mod one_hot_encoder;
86pub mod ordinal_encoder;
87pub mod polynomial_features;
88pub mod power_transformer;
89pub mod quantile_transformer;
90pub mod rfe;
91pub mod robust_scaler;
92pub mod select_percentile;
93pub mod spline_transformer;
94pub mod standard_scaler;
95pub mod target_encoder;
96
97// Re-exports
98pub use binarizer::Binarizer;
99pub use column_transformer::{
100 ColumnSelector, ColumnTransformer, FittedColumnTransformer, Remainder, make_column_transformer,
101};
102pub use feature_selection::{
103 FittedSelectKBest, FittedVarianceThreshold, ScoreFunc, SelectFromModel, SelectKBest,
104 VarianceThreshold,
105};
106pub use function_transformer::FunctionTransformer;
107pub use imputer::{FittedSimpleImputer, ImputeStrategy, SimpleImputer};
108pub use label_encoder::{FittedLabelEncoder, LabelEncoder};
109pub use max_abs_scaler::{FittedMaxAbsScaler, MaxAbsScaler};
110pub use min_max_scaler::{FittedMinMaxScaler, MinMaxScaler};
111pub use normalizer::Normalizer;
112pub use one_hot_encoder::{FittedOneHotEncoder, OneHotEncoder};
113pub use ordinal_encoder::{FittedOrdinalEncoder, OrdinalEncoder};
114pub use polynomial_features::PolynomialFeatures;
115pub use power_transformer::{FittedPowerTransformer, PowerTransformer};
116pub use robust_scaler::{FittedRobustScaler, RobustScaler};
117pub use standard_scaler::{FittedStandardScaler, StandardScaler};
118
119// Phase 3 re-exports
120pub use binary_encoder::{BinaryEncoder, FittedBinaryEncoder};
121pub use iterative_imputer::{FittedIterativeImputer, InitialStrategy, IterativeImputer};
122pub use kbins_discretizer::{BinEncoding, BinStrategy, FittedKBinsDiscretizer, KBinsDiscretizer};
123pub use knn_imputer::{FittedKNNImputer, KNNImputer, KNNWeights};
124pub use quantile_transformer::{
125 FittedQuantileTransformer, OutputDistribution, QuantileTransformer,
126};
127pub use rfe::{RFE, RFECV};
128pub use select_percentile::{FittedSelectPercentile, SelectPercentile};
129pub use spline_transformer::{FittedSplineTransformer, KnotStrategy, SplineTransformer};
130pub use target_encoder::{FittedTargetEncoder, TargetEncoder};