Skip to main content

ferrolearn_preprocess/
lib.rs

1//! # ferrolearn-preprocess
2//!
3//! Data preprocessing transformers for the ferrolearn machine learning framework.
4//!
5//! This crate provides standard scalers, encoders, imputers, and feature
6//! selection utilities that follow the ferrolearn `Fit`/`Transform` trait
7//! pattern.
8//!
9//! ## Scalers
10//!
11//! All scalers are generic over `F: Float + Send + Sync + 'static` and implement
12//! [`Fit<Array2<F>, ()>`](ferrolearn_core::Fit) (returning a `Fitted*` type) and
13//! [`FitTransform<Array2<F>>`](ferrolearn_core::FitTransform). The fitted types
14//! implement [`Transform<Array2<F>>`](ferrolearn_core::Transform).
15//!
16//! - [`StandardScaler`] — zero-mean, unit-variance scaling
17//! - [`MinMaxScaler`] — scale features to a given range (default `[0, 1]`)
18//! - [`RobustScaler`] — median / IQR-based scaling, robust to outliers
19//! - [`MaxAbsScaler`] — scale by maximum absolute value so values are in `[-1, 1]`
20//! - [`normalizer::Normalizer`] — normalize each sample (row) to unit norm
21//! - [`power_transformer::PowerTransformer`] — Yeo-Johnson power transform
22//!
23//! ## Encoders
24//!
25//! - [`OneHotEncoder`] — encode `Array2<usize>` categorical columns as binary columns
26//! - [`LabelEncoder`] — map `Array1<String>` labels to integer indices
27//! - [`ordinal_encoder::OrdinalEncoder`] — map string categories to integers in
28//!   order of first appearance
29//!
30//! ## Imputers
31//!
32//! - [`imputer::SimpleImputer`] — fill missing (NaN) values per feature column
33//!   using Mean, Median, MostFrequent, or Constant strategy.
34//!
35//! ## Feature Selection
36//!
37//! - [`feature_selection::VarianceThreshold`] — remove features with variance
38//!   below a configurable threshold.
39//! - [`feature_selection::SelectKBest`] — keep the K features with the highest
40//!   ANOVA F-scores against class labels.
41//! - [`feature_selection::SelectFromModel`] — keep features whose importance
42//!   weight (from a pre-fitted model) meets a configurable threshold.
43//!
44//! ## Feature Engineering
45//!
46//! - [`polynomial_features::PolynomialFeatures`] — generate polynomial and interaction features
47//! - [`binarizer::Binarizer`] — threshold features to binary values
48//! - [`function_transformer::FunctionTransformer`] — apply a user-provided function element-wise
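//!
//! ## Additional Modules
//!
//! The sections above are not exhaustive. The following public modules are
//! also part of this crate, and each has items re-exported at the crate root:
//!
//! [`binary_encoder`], [`column_transformer`], [`count_vectorizer`],
//! [`feature_scoring`], [`iterative_imputer`], [`kbins_discretizer`],
//! [`knn_imputer`], [`label_binarizer`], [`multi_label_binarizer`],
//! [`quantile_transformer`], [`random_projection`], [`rfe`],
//! [`select_percentile`], [`sequential_feature_selector`],
//! [`spline_transformer`], [`stat_selectors`], [`target_encoder`], and
//! [`tfidf`].
//!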
50//! ## Pipeline Integration
51//!
52//! `StandardScaler<f64>`, `MinMaxScaler<f64>`, `RobustScaler<f64>`,
53//! `MaxAbsScaler<f64>`, `Normalizer<f64>`, `PowerTransformer<f64>`,
54//! `PolynomialFeatures<f64>`, `SimpleImputer<f64>`, `VarianceThreshold<f64>`,
55//! `SelectKBest<f64>`, and `SelectFromModel<f64>` each implement
56//! [`PipelineTransformer`](ferrolearn_core::pipeline::PipelineTransformer)
57//! so they can be used as steps inside a
58//! [`Pipeline`](ferrolearn_core::pipeline::Pipeline).
59//!
60//! # Examples
61//!
62//! ```
63//! use ferrolearn_preprocess::StandardScaler;
64//! use ferrolearn_core::traits::FitTransform;
65//! use ndarray::array;
66//!
67//! let x = array![[1.0_f64, 10.0], [2.0, 20.0], [3.0, 30.0]];
68//! let scaled = StandardScaler::<f64>::new().fit_transform(&x).unwrap();
69//! // scaled columns now have mean ≈ 0 and std ≈ 1
70//! ```
71
72pub mod binarizer;
73pub mod binary_encoder;
74pub mod column_transformer;
75pub mod count_vectorizer;
76pub mod feature_scoring;
77pub mod feature_selection;
78pub mod function_transformer;
79pub mod imputer;
80pub mod iterative_imputer;
81pub mod kbins_discretizer;
82pub mod knn_imputer;
83pub mod label_binarizer;
84pub mod label_encoder;
85pub mod max_abs_scaler;
86pub mod min_max_scaler;
87pub mod multi_label_binarizer;
88pub mod normalizer;
89pub mod one_hot_encoder;
90pub mod ordinal_encoder;
91pub mod polynomial_features;
92pub mod power_transformer;
93pub mod quantile_transformer;
94pub mod random_projection;
95pub mod rfe;
96pub mod robust_scaler;
97pub mod select_percentile;
98pub mod sequential_feature_selector;
99pub mod spline_transformer;
100pub mod standard_scaler;
101pub mod stat_selectors;
102pub mod target_encoder;
103pub mod tfidf;
104
105// Re-exports
106pub use binarizer::Binarizer;
107pub use column_transformer::{
108    ColumnSelector, ColumnTransformer, FittedColumnTransformer, Remainder, make_column_transformer,
109};
110pub use feature_selection::{
111    FittedSelectKBest, FittedVarianceThreshold, ScoreFunc, SelectFromModel, SelectKBest,
112    VarianceThreshold,
113};
114pub use function_transformer::FunctionTransformer;
115pub use imputer::{FittedSimpleImputer, ImputeStrategy, SimpleImputer};
116pub use label_encoder::{FittedLabelEncoder, LabelEncoder};
117pub use max_abs_scaler::{FittedMaxAbsScaler, MaxAbsScaler};
118pub use min_max_scaler::{FittedMinMaxScaler, MinMaxScaler};
119pub use normalizer::Normalizer;
120pub use one_hot_encoder::{FittedOneHotEncoder, OneHotEncoder};
121pub use ordinal_encoder::{FittedOrdinalEncoder, OrdinalEncoder};
122pub use polynomial_features::PolynomialFeatures;
123pub use power_transformer::{FittedPowerTransformer, PowerTransformer};
124pub use robust_scaler::{FittedRobustScaler, RobustScaler};
125pub use standard_scaler::{FittedStandardScaler, StandardScaler};
126
127// Phase 3 re-exports
128pub use binary_encoder::{BinaryEncoder, FittedBinaryEncoder};
129pub use iterative_imputer::{FittedIterativeImputer, InitialStrategy, IterativeImputer};
130pub use kbins_discretizer::{BinEncoding, BinStrategy, FittedKBinsDiscretizer, KBinsDiscretizer};
131pub use knn_imputer::{FittedKNNImputer, KNNImputer, KNNWeights};
132pub use quantile_transformer::{
133    FittedQuantileTransformer, OutputDistribution, QuantileTransformer,
134};
135pub use rfe::{RFE, RFECV};
136pub use select_percentile::{FittedSelectPercentile, SelectPercentile};
137pub use spline_transformer::{FittedSplineTransformer, KnotStrategy, SplineTransformer};
138pub use target_encoder::{FittedTargetEncoder, TargetEncoder};
139
140// Text processing re-exports
141pub use count_vectorizer::{CountVectorizer, FittedCountVectorizer};
142pub use tfidf::{FittedTfidfTransformer, TfidfNorm, TfidfTransformer};
143
144// Random projection re-exports
145pub use random_projection::{
146    FittedGaussianRandomProjection, FittedSparseRandomProjection, GaussianRandomProjection,
147    SparseRandomProjection,
148};
149
150// Newly wired (previously orphaned) re-exports
151pub use feature_scoring::{chi2, compute_scores_classif, compute_scores_regression, f_classif, f_regression};
152pub use label_binarizer::{FittedLabelBinarizer, LabelBinarizer};
153pub use multi_label_binarizer::{FittedMultiLabelBinarizer, MultiLabelBinarizer};
154pub use sequential_feature_selector::{
155    Direction, FittedSequentialFeatureSelector, SequentialFeatureSelector,
156};
157pub use stat_selectors::{
158    FittedSelectFdr, FittedSelectFpr, FittedSelectFwe, SelectFdr, SelectFpr, SelectFwe,
159};