gbrt_rs/data/mod.rs
1//! Data handling and preprocessing for gradient boosting.
2//!
3//! This module provides comprehensive data structures and utilities for managing
4//! datasets, feature matrices, and preprocessing pipelines. It serves as the
5//! foundation for all data operations in the gradient boosting library.
6//!
7//! # Submodules
8//!
9//! - [`dataset`]: High-level dataset container with features, targets, and metadata
10//! - [`feature_matrix`]: Memory-efficient feature storage and access
11//! - [`preprocessing`]: Data scaling, encoding, and transformation utilities
12//!
13//! # Key Features
14//!
15//! ## Dataset Management
16//! - Unified container for features, targets, and sample metadata
17//! - Type-safe data access and manipulation
18//! - Integration with preprocessing pipelines
19//!
20//! ## Feature Matrix
21//! - Optimized storage for feature data
22//! - Efficient row/column access patterns
23//! - Support for sparse and dense representations
24//!
25//! ## Preprocessing
26//! - Standard scaling (zero mean, unit variance)
27//! - Min-max scaling
28//! - Categorical encoding
29//! - Automated preprocessing pipelines
30
31pub mod dataset;
32pub mod feature_matrix;
33pub mod preprocessing;
34
35// Dataset management
36pub use dataset::{
37 // High-level dataset container with features, targets, and metadata.
38 Dataset
39};
40
41// Feature matrix
42pub use feature_matrix::{
43 // Memory-efficient feature storage with optimized access patterns.
44 FeatureMatrix,
45 // Errors from feature matrix operations.
46 FeatureMatrixError
47};
48
49// Preprocessing
50pub use preprocessing::{
51 // Trait for data scaling transformations.
52 Scaler,
53 // Standard scaling (zero mean, unit variance).
54 StandardScaler,
55 // Min-max scaling to [0, 1] range.
56 MinMaxScaler,
57 // Encode categorical features to numeric representations.
58 CategoricalEncoder,
59 // Automatically detect categorical columns in data.
60 detect_categorical_columns,
61 // Automated preprocessing pipeline builder.
62 DataPreprocessor,
63 // Errors from preprocessing operations.
64 PreprocessingError
65};
66
67/// Unified error type for all data-related operations.
68///
69/// This enum aggregates errors from all data submodules, providing a single
70/// error type for data loading, preprocessing, validation, and access operations.
71///
72/// # Error Variants
73///
74/// - `FeatureMatrixError`: Invalid feature matrix operations (index out of bounds, etc.)
75/// - `InvalidShape`: Data dimensions don't match expectations
76/// - `CsvError`: CSV parsing failure
77/// - `IoError`: File system or I/O operation failure
78/// - `SerializationError`: JSON serialization/deserialization failure
79/// - `InvalidTargets`: Target values are invalid for the task
80/// - `PreprocessingError**: Data transformation or scaling failed
81/// - `ValidationError`: Data validation checks failed
82///
83#[derive(thiserror::Error, Debug)]
84pub enum DataError {
85 /// Feature matrix operation error (index out of bounds, invalid access).
86 #[error("Feature matrix error: {0}")]
87 FeatureMatrixError(#[from] FeatureMatrixError),
88
89 /// Data dimensions don't match expected shape.
90 #[error("Invalid data shape: expected {expected}, got {got}")]
91 InvalidShape {
92 /// Expected shape or dimensions.
93 expected: String,
94 /// Actual shape or dimensions found.
95 got: String
96 },
97
98 /// CSV parsing failed.
99 #[error("CSV parsing error: {0}")]
100 CsvError(#[from] csv::Error),
101
102 /// File system or I/O operation failed.
103 #[error("IO error: {0}")]
104 IoError(#[from] std::io::Error),
105
106 /// JSON serialization or deserialization failed.
107 #[error("Serialization error: {0}")]
108 SerializationError(#[from] serde_json::Error),
109
110 /// Target values are invalid for the task (e.g., wrong cardinality, non-finite).
111 #[error("Invalid target values: {0}")]
112 InvalidTargets(String),
113
114 /// Data preprocessing or transformation failed.
115 #[error("Preprocessing error: {0}")]
116 PreprocessingError(String),
117
118 /// Data validation checks failed (finite values, ranges, etc.).
119 #[error("Data validation error: {0}")]
120 ValidationError(String),
121}
122
123/// Result type for data operations.
124///
125/// This is a convenient type alias for `Result<T, DataError>`.
126pub type DataResult<T> = std::result::Result<T, DataError>;
127