gbrt_rs/data/
mod.rs

1//! Data handling and preprocessing for gradient boosting.
2//!
3//! This module provides comprehensive data structures and utilities for managing
4//! datasets, feature matrices, and preprocessing pipelines. It serves as the
5//! foundation for all data operations in the gradient boosting library.
6//!
7//! # Submodules
8//!
9//! - [`dataset`]: High-level dataset container with features, targets, and metadata
10//! - [`feature_matrix`]: Memory-efficient feature storage and access
11//! - [`preprocessing`]: Data scaling, encoding, and transformation utilities
12//!
13//! # Key Features
14//!
15//! ## Dataset Management
16//! - Unified container for features, targets, and sample metadata
17//! - Type-safe data access and manipulation
18//! - Integration with preprocessing pipelines
19//!
20//! ## Feature Matrix
21//! - Optimized storage for feature data
22//! - Efficient row/column access patterns
23//! - Support for sparse and dense representations
24//!
25//! ## Preprocessing
26//! - Standard scaling (zero mean, unit variance)
27//! - Min-max scaling
28//! - Categorical encoding
29//! - Automated preprocessing pipelines
30
31pub mod dataset;
32pub mod feature_matrix;
33pub mod preprocessing;
34
35// Dataset management
36pub use dataset::{
37    // High-level dataset container with features, targets, and metadata.
38    Dataset
39};
40
41// Feature matrix
42pub use feature_matrix::{
43    // Memory-efficient feature storage with optimized access patterns.
44    FeatureMatrix, 
45    // Errors from feature matrix operations.
46    FeatureMatrixError
47};
48
49// Preprocessing
50pub use preprocessing::{
51    // Trait for data scaling transformations.
52    Scaler, 
53    // Standard scaling (zero mean, unit variance).
54    StandardScaler, 
55    // Min-max scaling to [0, 1] range.
56    MinMaxScaler, 
57    // Encode categorical features to numeric representations.
58    CategoricalEncoder,
59    // Automatically detect categorical columns in data. 
60    detect_categorical_columns, 
61    // Automated preprocessing pipeline builder.
62    DataPreprocessor, 
63    // Errors from preprocessing operations.
64    PreprocessingError
65};
66
67/// Unified error type for all data-related operations.
68///
69/// This enum aggregates errors from all data submodules, providing a single
70/// error type for data loading, preprocessing, validation, and access operations.
71///
72/// # Error Variants
73///
74/// - `FeatureMatrixError`: Invalid feature matrix operations; wraps [`FeatureMatrixError`]
75/// - `InvalidShape`: Data dimensions don't match expectations (`expected` vs. `got`)
76/// - `CsvError`: CSV parsing failure; wraps `csv::Error`
77/// - `IoError`: File system or I/O operation failure; wraps `std::io::Error`
78/// - `SerializationError`: JSON serialization/deserialization failure; wraps `serde_json::Error`
79/// - `InvalidTargets`: Target values are invalid for the task (message string)
80/// - `PreprocessingError`: Data transformation or scaling failed (message string)
81/// - `ValidationError`: Data validation checks failed (message string)
82///
83#[derive(thiserror::Error, Debug)]
84pub enum DataError {
85    /// Feature matrix operation error (index out of bounds, invalid access).
86    #[error("Feature matrix error: {0}")]
87    FeatureMatrixError(#[from] FeatureMatrixError),
88    
89    /// Data dimensions don't match the expected shape.
90    #[error("Invalid data shape: expected {expected}, got {got}")]
91    InvalidShape { 
92        /// Expected shape or dimensions, rendered as text for the message.
93        expected: String, 
94        /// Actual shape or dimensions found, rendered as text for the message.
95        got: String 
96    },
97    
98    /// CSV parsing failed; wraps the underlying `csv::Error`.
99    #[error("CSV parsing error: {0}")]
100    CsvError(#[from] csv::Error),
101    
102    /// File system or I/O operation failed; wraps the underlying `std::io::Error`.
103    #[error("IO error: {0}")]
104    IoError(#[from] std::io::Error),
105    
106    /// JSON serialization or deserialization failed; wraps `serde_json::Error`.
107    #[error("Serialization error: {0}")]
108    SerializationError(#[from] serde_json::Error),
109    
110    /// Target values are invalid for the task (e.g., wrong cardinality, non-finite).
111    #[error("Invalid target values: {0}")]
112    InvalidTargets(String),
113    
114    /// Data preprocessing or transformation failed; holds a message string.
115    #[error("Preprocessing error: {0}")]
116    PreprocessingError(String),
117
118    /// Data validation checks failed (finite values, ranges, etc.); holds a message string.
119    #[error("Data validation error: {0}")]
120    ValidationError(String),
121}
122
123/// Result type for data operations.
124///
125/// Convenience alias for `std::result::Result<T, DataError>` with the error fixed to [`DataError`].
126pub type DataResult<T> = std::result::Result<T, DataError>;
127