Expand description
Preprocessing utilities for sklears
This crate provides data preprocessing utilities including:
- Scaling (StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer)
- Encoding (LabelEncoder, OneHotEncoder, OrdinalEncoder)
- Imputation (SimpleImputer, KNNImputer, IterativeImputer, GAINImputer)
- Feature engineering (PolynomialFeatures, SplineTransformer, PowerTransformer, FunctionTransformer)
- Text processing (TfIdfVectorizer, TextTokenizer, NgramGenerator, TextSimilarity, BagOfWordsEmbedding)
- Advanced pipelines (conditional steps, parallel branches, caching, dynamic construction)
Re-exports
pub use adaptive::AdaptationStrategy;pub use adaptive::AdaptiveConfig;pub use adaptive::AdaptiveParameterSelector;pub use adaptive::DataCharacteristics;pub use adaptive::DistributionType;pub use adaptive::ImputationParameters;pub use adaptive::OutlierDetectionParameters;pub use adaptive::ParameterEvaluation;pub use adaptive::ParameterRecommendations;pub use adaptive::ScalingParameters;pub use adaptive::TransformationParameters as AdaptiveTransformationParameters;pub use automated_feature_engineering::AutoFeatureConfig;pub use automated_feature_engineering::AutoFeatureEngineer;pub use automated_feature_engineering::AutoFeatureEngineerFitted;pub use automated_feature_engineering::Domain;pub use automated_feature_engineering::GenerationStrategy;pub use automated_feature_engineering::MathFunction;pub use automated_feature_engineering::SelectionMethod;pub use automated_feature_engineering::TransformationFunction;pub use automated_feature_engineering::TransformationType;pub use binarization::Binarizer;pub use binarization::BinarizerConfig;pub use binarization::DiscretizationStrategy;pub use binarization::DiscretizerEncoding;pub use binarization::KBinsDiscretizer;pub use binarization::KBinsDiscretizerConfig;pub use column_transformer::ColumnSelector;pub use column_transformer::ColumnTransformer;pub use column_transformer::ColumnTransformerConfig;pub use column_transformer::DataType;pub use column_transformer::RemainderStrategy;pub use column_transformer::TransformerStep;pub use column_transformer::TransformerWrapper;pub use cross_validation::CVScore;pub use cross_validation::InformationPreservationMetric;pub use cross_validation::KFold;pub use cross_validation::ParameterDistribution;pub use cross_validation::ParameterGrid;pub use cross_validation::PreprocessingMetric;pub use cross_validation::StratifiedKFold;pub use cross_validation::VariancePreservationMetric;pub use data_quality::CorrelationWarning;pub use data_quality::DataQualityConfig;pub use 
data_quality::DataQualityReport;pub use data_quality::DataQualityValidator;pub use data_quality::DistributionStats;pub use data_quality::IssueCategory;pub use data_quality::IssueSeverity;pub use data_quality::MissingStats;pub use data_quality::OutlierMethod;pub use data_quality::OutlierStats;pub use data_quality::QualityIssue;pub use dimensionality_reduction::ICAConfig;pub use dimensionality_reduction::ICAFitted;pub use dimensionality_reduction::IcaAlgorithm;pub use dimensionality_reduction::IcaFunction;pub use dimensionality_reduction::LDAConfig;pub use dimensionality_reduction::LDAFitted;pub use dimensionality_reduction::LdaSolver;pub use dimensionality_reduction::NMFConfig;pub use dimensionality_reduction::NMFFitted;pub use dimensionality_reduction::NmfInit;pub use dimensionality_reduction::NmfSolver;pub use dimensionality_reduction::PCAConfig;pub use dimensionality_reduction::PCAFitted;pub use dimensionality_reduction::PcaSolver;pub use dimensionality_reduction::ICA;pub use dimensionality_reduction::LDA;pub use dimensionality_reduction::NMF;pub use dimensionality_reduction::PCA;pub use encoding::BinaryEncoder;pub use encoding::BinaryEncoderConfig;pub use encoding::CategoricalEmbedding;pub use encoding::CategoricalEmbeddingConfig;pub use encoding::FrequencyEncoder;pub use encoding::FrequencyEncoderConfig;pub use encoding::HashEncoder;pub use encoding::HashEncoderConfig;pub use encoding::LabelEncoder;pub use encoding::OneHotEncoder;pub use encoding::OrdinalEncoder;pub use encoding::RareStrategy;pub use encoding::TargetEncoder;pub use feature_engineering::ExtrapolationStrategy;pub use feature_engineering::FeatureOrder;pub use feature_engineering::KnotStrategy;pub use feature_engineering::PolynomialFeatures;pub use feature_engineering::PowerMethod;pub use feature_engineering::PowerTransformer;pub use feature_engineering::PowerTransformerConfig;pub use feature_engineering::SplineTransformer;pub use feature_engineering::SplineTransformerConfig;pub use 
feature_union::FeatureUnion;pub use feature_union::FeatureUnionConfig;pub use feature_union::FeatureUnionStep;pub use geospatial::calculate_distance;pub use geospatial::haversine_distance;pub use geospatial::vincenty_distance;pub use geospatial::Coordinate;pub use geospatial::CoordinateSystem;pub use geospatial::CoordinateTransformer;pub use geospatial::CoordinateTransformerConfig;pub use geospatial::CoordinateTransformerFitted;pub use geospatial::Geohash;pub use geospatial::GeohashEncoder;pub use geospatial::GeohashEncoderConfig;pub use geospatial::GeohashEncoderFitted;pub use geospatial::ProximityFeatures;pub use geospatial::ProximityFeaturesConfig;pub use geospatial::ProximityFeaturesFitted;pub use geospatial::SpatialAutocorrelation;pub use geospatial::SpatialAutocorrelationConfig;pub use geospatial::SpatialAutocorrelationFitted;pub use geospatial::SpatialBinning;pub use geospatial::SpatialBinningConfig;pub use geospatial::SpatialBinningFitted;pub use geospatial::SpatialClustering;pub use geospatial::SpatialClusteringConfig;pub use geospatial::SpatialClusteringFitted;pub use geospatial::SpatialClusteringMethod;pub use geospatial::SpatialDistanceFeatures;pub use geospatial::SpatialDistanceFeaturesConfig;pub use geospatial::SpatialDistanceFeaturesFitted;pub use geospatial::SpatialDistanceMetric;pub use image_preprocessing::ColorSpace;pub use image_preprocessing::ColorSpaceTransformer;pub use image_preprocessing::EdgeDetectionMethod;pub use image_preprocessing::EdgeDetector;pub use image_preprocessing::ImageAugmenter;pub use image_preprocessing::ImageAugmenterConfig;pub use image_preprocessing::ImageFeatureExtractor;pub use image_preprocessing::ImageNormalizationStrategy;pub use image_preprocessing::ImageNormalizer;pub use image_preprocessing::ImageNormalizerConfig;pub use image_preprocessing::ImageNormalizerFitted;pub use image_preprocessing::ImageResizer;pub use image_preprocessing::InterpolationMethod as ImageInterpolationMethod;pub use 
imputation::BaseImputationMethod;pub use imputation::DistanceMetric;pub use imputation::FeatureMissingStats;pub use imputation::GAINImputer;pub use imputation::GAINImputerConfig;pub use imputation::ImputationStrategy;pub use imputation::IterativeImputer;pub use imputation::KNNImputer;pub use imputation::MissingPattern;pub use imputation::MissingValueAnalysis;pub use imputation::MissingnessType;pub use imputation::MultipleImputationResult;pub use imputation::MultipleImputer;pub use imputation::MultipleImputerConfig;pub use imputation::OutlierAwareImputer;pub use imputation::OutlierAwareImputerConfig;pub use imputation::OutlierAwareStatistics;pub use imputation::OutlierAwareStrategy;pub use imputation::OverallMissingStats;pub use imputation::SimpleImputer;pub use information_theory::approximate_entropy;pub use information_theory::conditional_entropy;pub use information_theory::joint_entropy;pub use information_theory::lempel_ziv_complexity;pub use information_theory::mutual_information;pub use information_theory::normalized_mutual_information;pub use information_theory::permutation_entropy;pub use information_theory::renyi_entropy;pub use information_theory::sample_entropy;pub use information_theory::shannon_entropy;pub use information_theory::transfer_entropy;pub use information_theory::InformationFeatureSelector;pub use information_theory::InformationFeatureSelectorConfig;pub use information_theory::InformationFeatureSelectorFitted;pub use information_theory::InformationMetric;pub use kernel_centerer::KernelCenterer;pub use label_binarization::LabelBinarizer;pub use label_binarization::LabelBinarizerConfig;pub use label_binarization::MultiLabelBinarizer;pub use label_binarization::MultiLabelBinarizerConfig;pub use monitoring::LogLevel;pub use monitoring::MonitoringConfig;pub use monitoring::MonitoringSession;pub use monitoring::MonitoringSummary;pub use monitoring::TransformationMetrics;pub use outlier_detection::FeatureOutlierParams;pub use 
outlier_detection::OutlierDetectionMethod;pub use outlier_detection::OutlierDetectionResult;pub use outlier_detection::OutlierDetector;pub use outlier_detection::OutlierDetectorConfig;pub use outlier_detection::OutlierStatistics;pub use outlier_detection::OutlierSummary;pub use outlier_transformation::FeatureTransformationParams;pub use outlier_transformation::GlobalTransformationParams;pub use outlier_transformation::OutlierTransformationConfig;pub use outlier_transformation::OutlierTransformationMethod;pub use outlier_transformation::OutlierTransformer;pub use outlier_transformation::TransformationParameters;pub use pipeline::AdvancedPipeline;pub use pipeline::AdvancedPipelineBuilder;pub use pipeline::AdvancedPipelineConfig;pub use pipeline::BranchCombinationStrategy;pub use pipeline::CacheConfig;pub use pipeline::CacheStats;pub use pipeline::ConditionalStep;pub use pipeline::ConditionalStepConfig;pub use pipeline::DynamicPipeline;pub use pipeline::ErrorHandlingStrategy;pub use pipeline::ParallelBranchConfig;pub use pipeline::ParallelBranches;pub use pipeline::PipelineStep;pub use pipeline::TransformationCache;pub use pipeline_validation::PerformanceRecommendation;pub use pipeline_validation::PipelineValidator;pub use pipeline_validation::PipelineValidatorConfig;pub use pipeline_validation::RecommendationCategory;pub use pipeline_validation::ValidationError;pub use pipeline_validation::ValidationErrorType;pub use pipeline_validation::ValidationResult;pub use pipeline_validation::ValidationWarning;pub use pipeline_validation::WarningSeverity;pub use probabilistic_imputation::BayesianImputer;pub use probabilistic_imputation::BayesianImputerConfig;pub use probabilistic_imputation::BayesianImputerFitted;pub use probabilistic_imputation::EMImputer;pub use probabilistic_imputation::EMImputerConfig;pub use probabilistic_imputation::EMImputerFitted;pub use probabilistic_imputation::GaussianProcessImputer;pub use probabilistic_imputation::GaussianProcessImputerConfig;pub 
use probabilistic_imputation::GaussianProcessImputerFitted;pub use probabilistic_imputation::MonteCarloBaseMethod;pub use probabilistic_imputation::MonteCarloImputer;pub use probabilistic_imputation::MonteCarloImputerConfig;pub use probabilistic_imputation::MonteCarloImputerFitted;pub use quantile_transformer::QuantileOutput;pub use quantile_transformer::QuantileTransformer;pub use quantile_transformer::QuantileTransformerConfig;pub use robust_preprocessing::MissingValueStats;pub use robust_preprocessing::RobustPreprocessingStats;pub use robust_preprocessing::RobustPreprocessor;pub use robust_preprocessing::RobustPreprocessorConfig;pub use robust_preprocessing::RobustStrategy;pub use robust_preprocessing::TransformationStats;pub use scaling::FeatureWiseScaler;pub use scaling::FeatureWiseScalerConfig;pub use scaling::MaxAbsScaler;pub use scaling::MinMaxScaler;pub use scaling::NormType;pub use scaling::Normalizer;pub use scaling::OutlierAwareScaler;pub use scaling::OutlierAwareScalerConfig;pub use scaling::OutlierAwareScalingStrategy;pub use scaling::OutlierScalingStats;pub use scaling::RobustScaler;pub use scaling::RobustStatistic;pub use scaling::ScalingMethod;pub use scaling::StandardScaler;pub use scaling::UnitVectorScaler;pub use scaling::UnitVectorScalerConfig;pub use simd_optimizations::add_scalar_f64_simd;pub use simd_optimizations::add_vectors_f64_simd;pub use simd_optimizations::mean_f64_simd;pub use simd_optimizations::min_max_f64_simd;pub use simd_optimizations::mul_scalar_f64_simd;pub use simd_optimizations::ndarray_ops;pub use simd_optimizations::sub_vectors_f64_simd;pub use simd_optimizations::variance_f64_simd;pub use simd_optimizations::SimdConfig;pub use sparse_optimizations::sparse_matvec;pub use sparse_optimizations::SparseConfig;pub use sparse_optimizations::SparseFormat;pub use sparse_optimizations::SparseMatrix;pub use sparse_optimizations::SparseStandardScaler;pub use sparse_optimizations::SparseStandardScalerFitted;pub use 
streaming::AdaptiveConfig as StreamingAdaptiveConfig;pub use streaming::AdaptiveParameterManager;pub use streaming::AdaptiveStreamingMinMaxScaler;pub use streaming::AdaptiveStreamingStandardScaler;pub use streaming::IncrementalPCA;pub use streaming::IncrementalPCAStats;pub use streaming::MiniBatchConfig;pub use streaming::MiniBatchIterator;pub use streaming::MiniBatchPipeline;pub use streaming::MiniBatchStats;pub use streaming::MiniBatchStreamingTransformer;pub use streaming::MiniBatchTransformer;pub use streaming::MultiQuantileEstimator;pub use streaming::OnlineMADEstimator;pub use streaming::OnlineMADStats;pub use streaming::OnlineQuantileEstimator;pub use streaming::OnlineQuantileStats;pub use streaming::ParameterUpdate;pub use streaming::StreamCharacteristics;pub use streaming::StreamingConfig;pub use streaming::StreamingLabelEncoder;pub use streaming::StreamingMinMaxScaler;pub use streaming::StreamingPipeline;pub use streaming::StreamingRobustScaler;pub use streaming::StreamingRobustScalerStats;pub use streaming::StreamingSimpleImputer;pub use streaming::StreamingStandardScaler;pub use streaming::StreamingStats;pub use streaming::StreamingTransformer;pub use temporal::ChangePointDetector;pub use temporal::ChangePointDetectorConfig;pub use temporal::ChangePointMethod;pub use temporal::DateComponents;pub use temporal::DateTime;pub use temporal::DecompositionMethod;pub use temporal::FillMethod;pub use temporal::FourierFeatureGenerator;pub use temporal::FourierFeatureGeneratorConfig;pub use temporal::InterpolationMethod;pub use temporal::LagFeatureGenerator;pub use temporal::LagFeatureGeneratorConfig;pub use temporal::MultiVariateTimeSeriesAligner;pub use temporal::ResamplingMethod;pub use temporal::SeasonalDecomposer;pub use temporal::SeasonalDecomposerConfig;pub use temporal::StationarityMethod;pub use temporal::StationarityTransformer;pub use temporal::StationarityTransformerConfig;pub use temporal::StationarityTransformerFitted;pub use 
temporal::TemporalFeatureExtractor;pub use temporal::TemporalFeatureExtractorConfig;pub use temporal::TimeSeriesInterpolator;pub use temporal::TimeSeriesResampler;pub use temporal::TrendDetector;pub use temporal::TrendDetectorConfig;pub use temporal::TrendMethod;pub use text::BagOfWordsConfig;pub use text::BagOfWordsEmbedding;pub use text::NgramGenerator;pub use text::NgramGeneratorConfig;pub use text::NgramType;pub use text::NormalizationStrategy;pub use text::SimilarityMetric;pub use text::TextSimilarity;pub use text::TextSimilarityConfig;pub use text::TextTokenizer;pub use text::TextTokenizerConfig;pub use text::TfIdfVectorizer;pub use text::TfIdfVectorizerConfig;pub use text::TokenizationStrategy;pub use type_safety::Dimension;pub use type_safety::Dynamic;pub use type_safety::Fitted;pub use type_safety::Known;pub use type_safety::TransformState;pub use type_safety::TypeSafeConfig;pub use type_safety::TypeSafePipeline;pub use type_safety::TypeSafeTransformer;pub use type_safety::Unfitted;pub use winsorization::NanStrategy;pub use winsorization::WinsorizationStats;pub use winsorization::Winsorizer;pub use winsorization::WinsorizerConfig;pub use functional::add_dummy_feature;pub use functional::binarize;pub use functional::label_binarize;pub use functional::maxabs_scale;pub use functional::minmax_scale;pub use functional::normalize;pub use functional::robust_scale;pub use functional::scale;
Modules
- adaptive
- Adaptive preprocessing parameters that automatically tune based on data characteristics
- automated_feature_engineering
- Automated Feature Engineering
- binarization
- Binarization transformers
- column_transformer
- Column Transformer
- cross_validation
- Cross-Validation Utilities for Preprocessing
- data_quality
- Data Quality Validation Framework
- dimensionality_reduction
- Dimensionality Reduction Transformers
- encoding
- Data encoding and categorical feature transformation utilities
- feature_engineering
- Feature engineering utilities
- feature_union
- Feature Union
- functional
- Functional APIs for preprocessing
- geospatial
- Geospatial preprocessing module
- image_preprocessing
- Image Preprocessing for Computer Vision Applications
- imputation
- Missing value imputation utilities
- information_theory
- Information-theoretic features and transformations
- kernel_centerer
- Kernel Centerer for centering kernel matrices
- label_binarization
- Label Binarization transformers
- monitoring
- Transformation Monitoring and Performance Metrics
- outlier_detection
- Outlier detection utilities
- outlier_transformation
- Outlier transformation methods for handling extreme values
- pipeline
- Advanced Pipeline Features for Preprocessing Transformations
- pipeline_validation
- Pipeline Validation Utilities
- prelude
- Prelude module for convenient imports
- probabilistic_imputation
- Probabilistic imputation methods
- quantile_transformer
- Quantile Transformer
- robust_preprocessing
- Robust preprocessing module for outlier-resilient data preprocessing
- scaling
- Data scaling utilities
- simd_optimizations
- SIMD optimizations for preprocessing operations
- sparse_optimizations
- Sparse Matrix Optimizations for Preprocessing
- streaming
- Streaming data preprocessing for large datasets
- temporal
- Temporal feature engineering utilities
- text
- Text preprocessing utilities for sklears
- type_safety
- Advanced type safety for preprocessing transformers
- winsorization
- Winsorization utilities for capping extreme outliers