sklears_core/dataset/
builder.rs

//! Dataset builder pattern with compile-time validation
//!
//! This module provides a type-safe builder pattern for constructing datasets.
//! The builder uses phantom types to ensure that both data and targets are
//! provided before the dataset can be built, catching errors at compile time.
6use crate::dataset::core::Dataset;
7
/// Marker type indicating data has not been set in the builder.
///
/// Used purely as a type-level tag for the `DataState` parameter of
/// `DatasetBuilder`; it holds no runtime data.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoData;
11
/// Marker type indicating data has been set in the builder.
///
/// Used purely as a type-level tag for the `DataState` parameter of
/// `DatasetBuilder`; it holds no runtime data.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct HasData;
15
/// Marker type indicating target has not been set in the builder.
///
/// Used purely as a type-level tag for the `TargetState` parameter of
/// `DatasetBuilder`; it holds no runtime data.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoTarget;
19
/// Marker type indicating target has been set in the builder.
///
/// Used purely as a type-level tag for the `TargetState` parameter of
/// `DatasetBuilder`; it holds no runtime data.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct HasTarget;
23
/// Type-safe builder for Dataset construction with compile-time validation
///
/// The DatasetBuilder uses phantom types to track whether data and targets
/// have been set, preventing incomplete datasets from being constructed.
/// The state markers are stored as zero-sized `PhantomData` fields, so the
/// tracking itself adds no data to the builder.
///
/// # Type Parameters
///
/// - `X`: Type of the feature data
/// - `Y`: Type of the target data
/// - `DataState`: Phantom type tracking data state (NoData/HasData)
/// - `TargetState`: Phantom type tracking target state (NoTarget/HasTarget)
///
/// # Examples
///
/// ```rust
/// use sklears_core::dataset::Dataset;
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// let features = Array2::<f64>::zeros((100, 4));
/// let targets = Array1::<f64>::zeros(100);
///
/// let dataset = Dataset::builder()
///     .data(features)
///     .target(targets)
///     .description("My dataset".to_string())
///     .feature_names(vec![
///         "f1".to_string(),
///         "f2".to_string(),
///         "f3".to_string(),
///         "f4".to_string(),
///     ])
///     .build();
/// ```
#[derive(Debug)]
#[must_use = "a builder does nothing until `build()` is called"]
pub struct DatasetBuilder<X, Y, DataState, TargetState> {
    /// Feature data; `None` until `data()` has been called.
    data: Option<X>,
    /// Target data; `None` until `target()` has been called.
    target: Option<Y>,
    /// Optional feature names; empty unless `feature_names()` was called.
    feature_names: Vec<String>,
    /// Optional target/class names; `None` unless `target_names()` was called.
    target_names: Option<Vec<String>>,
    /// Free-form description; empty unless `description()` was called.
    description: String,
    /// Zero-sized tag recording whether data has been provided.
    _phantom_data: std::marker::PhantomData<DataState>,
    /// Zero-sized tag recording whether the target has been provided.
    _phantom_target: std::marker::PhantomData<TargetState>,
}
63
64impl<X, Y> DatasetBuilder<X, Y, NoData, NoTarget> {
65    /// Create a new dataset builder
66    ///
67    /// The builder starts in the initial state where neither data nor
68    /// targets have been set. Both must be provided before build() can be called.
69    ///
70    /// # Returns
71    ///
72    /// A new DatasetBuilder in the initial (NoData, NoTarget) state
73    pub fn new() -> Self {
74        Self {
75            data: None,
76            target: None,
77            feature_names: Vec::new(),
78            target_names: None,
79            description: String::new(),
80            _phantom_data: std::marker::PhantomData,
81            _phantom_target: std::marker::PhantomData,
82        }
83    }
84}
85
86impl<X, Y, TargetState> DatasetBuilder<X, Y, NoData, TargetState> {
87    /// Set the feature data (required)
88    ///
89    /// This method transitions the builder from NoData to HasData state,
90    /// bringing us closer to being able to build the dataset.
91    ///
92    /// # Arguments
93    ///
94    /// * `data` - The feature matrix or data structure
95    ///
96    /// # Returns
97    ///
98    /// DatasetBuilder with data set (HasData state)
99    pub fn data(self, data: X) -> DatasetBuilder<X, Y, HasData, TargetState> {
100        DatasetBuilder {
101            data: Some(data),
102            target: self.target,
103            feature_names: self.feature_names,
104            target_names: self.target_names,
105            description: self.description,
106            _phantom_data: std::marker::PhantomData,
107            _phantom_target: std::marker::PhantomData,
108        }
109    }
110}
111
112impl<X, Y, DataState> DatasetBuilder<X, Y, DataState, NoTarget> {
113    /// Set the target data (required)
114    ///
115    /// This method transitions the builder from NoTarget to HasTarget state,
116    /// bringing us closer to being able to build the dataset.
117    ///
118    /// # Arguments
119    ///
120    /// * `target` - The target values corresponding to the features
121    ///
122    /// # Returns
123    ///
124    /// DatasetBuilder with target set (HasTarget state)
125    pub fn target(self, target: Y) -> DatasetBuilder<X, Y, DataState, HasTarget> {
126        DatasetBuilder {
127            data: self.data,
128            target: Some(target),
129            feature_names: self.feature_names,
130            target_names: self.target_names,
131            description: self.description,
132            _phantom_data: std::marker::PhantomData,
133            _phantom_target: std::marker::PhantomData,
134        }
135    }
136}
137
138impl<X, Y, DataState, TargetState> DatasetBuilder<X, Y, DataState, TargetState> {
139    /// Set feature names (optional)
140    ///
141    /// Feature names improve interpretability and are used in various
142    /// visualization and analysis tools. This is an optional step.
143    ///
144    /// # Arguments
145    ///
146    /// * `names` - Vector of feature names
147    ///
148    /// # Returns
149    ///
150    /// Self with updated feature names
151    pub fn feature_names(mut self, names: Vec<String>) -> Self {
152        self.feature_names = names;
153        self
154    }
155
156    /// Set target names (optional)
157    ///
158    /// Target names are particularly useful for multi-class classification
159    /// where class labels need to be interpretable. This is an optional step.
160    ///
161    /// # Arguments
162    ///
163    /// * `names` - Vector of class/target names
164    ///
165    /// # Returns
166    ///
167    /// Self with updated target names
168    pub fn target_names(mut self, names: Vec<String>) -> Self {
169        self.target_names = Some(names);
170        self
171    }
172
173    /// Set dataset description (optional)
174    ///
175    /// Descriptions are useful for documenting the source, preprocessing
176    /// steps, or other relevant information. This is an optional step.
177    ///
178    /// # Arguments
179    ///
180    /// * `description` - String description of the dataset
181    ///
182    /// # Returns
183    ///
184    /// Self with updated description
185    pub fn description<S: Into<String>>(mut self, description: S) -> Self {
186        self.description = description.into();
187        self
188    }
189}
190
191impl<X, Y> DatasetBuilder<X, Y, HasData, HasTarget> {
192    /// Build the final dataset
193    ///
194    /// This method is only available when both data and target have been set,
195    /// ensuring compile-time safety. The unwrap() calls are safe because
196    /// the type system guarantees the values are present.
197    ///
198    /// # Returns
199    ///
200    /// A completed Dataset instance
201    pub fn build(self) -> Dataset<X, Y> {
202        Dataset {
203            data: self.data.unwrap(),     // Safe: HasData state guarantees this exists
204            target: self.target.unwrap(), // Safe: HasTarget state guarantees this exists
205            feature_names: self.feature_names,
206            target_names: self.target_names,
207            description: self.description,
208        }
209    }
210}
211
212/// Default implementation for the initial builder state
213impl<X, Y> Default for DatasetBuilder<X, Y, NoData, NoTarget> {
214    fn default() -> Self {
215        Self::new()
216    }
217}
218
// NOTE(review): no non-snake-case identifiers are visible in this module, so
// this allow looks removable — confirm before dropping it.
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{Array1, Array2};

    /// Builds a dataset with a description and feature names and checks that
    /// the metadata ends up on the resulting `Dataset`.
    #[test]
    fn test_builder_pattern() {
        let data = Array2::<f64>::zeros((10, 3));
        let target = Array1::<f64>::zeros(10);

        let dataset = DatasetBuilder::new()
            .data(data)
            .target(target)
            .description("Test dataset")
            .feature_names(vec!["f1".to_string(), "f2".to_string(), "f3".to_string()])
            .build();

        assert_eq!(dataset.description, "Test dataset");
        assert_eq!(dataset.feature_names.len(), 3);
        assert_eq!(
            dataset.feature_names,
            vec!["f1".to_string(), "f2".to_string(), "f3".to_string()]
        );
        // Target names were never set, so they must stay absent.
        assert!(dataset.target_names.is_none());
    }

    /// Checks that `data()` and `target()` may be called in either order.
    #[test]
    fn test_builder_order_independence() {
        let data = Array2::<f64>::zeros((5, 2));
        let target = Array1::<f64>::zeros(5);

        // Test that data and target can be set in any order
        let dataset1 = DatasetBuilder::new()
            .data(data.clone())
            .target(target.clone())
            .build();

        let dataset2 = DatasetBuilder::new().target(target).data(data).build();

        // Both should have the same structure
        assert_eq!(dataset1.data.dim(), dataset2.data.dim());
        assert_eq!(dataset1.target.len(), dataset2.target.len());
    }

    /// Sets every optional metadata field and checks each one on the result.
    #[test]
    fn test_builder_with_all_metadata() {
        let data = Array2::<f64>::ones((3, 2));
        let target = Array1::<f64>::ones(3);

        let dataset = DatasetBuilder::new()
            .data(data)
            .target(target)
            .feature_names(vec!["feature1".to_string(), "feature2".to_string()])
            .target_names(vec!["class_a".to_string(), "class_b".to_string()])
            .description("Complete dataset example")
            .build();

        assert_eq!(dataset.feature_names.len(), 2);
        assert!(dataset.target_names.is_some());
        assert_eq!(dataset.target_names.as_ref().unwrap().len(), 2);
        assert_eq!(dataset.description, "Complete dataset example");
    }
}
275        assert_eq!(dataset.description, "Complete dataset example");
276    }
277}