sklears_core/dataset/builder.rs
//! Dataset builder pattern with compile-time validation.
//!
//! This module provides a type-safe builder pattern for constructing datasets.
//! The builder uses phantom types to ensure that both data and targets are
//! provided before the dataset can be built, catching errors at compile time.
6use crate::dataset::core::Dataset;
7
/// Type-state marker: the builder has not been given feature data yet.
///
/// Used only as a type parameter of [`DatasetBuilder`]; it carries no runtime
/// data. The common traits are derived so that generic code, or a derive on a
/// type parameterised by this marker, is never blocked by a missing impl on a
/// zero-sized tag.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoData;
11
/// Type-state marker: the builder has been given feature data.
///
/// Used only as a type parameter of [`DatasetBuilder`]; it carries no runtime
/// data. The common traits are derived so that generic code, or a derive on a
/// type parameterised by this marker, is never blocked by a missing impl on a
/// zero-sized tag.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct HasData;
15
/// Type-state marker: the builder has not been given target values yet.
///
/// Used only as a type parameter of [`DatasetBuilder`]; it carries no runtime
/// data. The common traits are derived so that generic code, or a derive on a
/// type parameterised by this marker, is never blocked by a missing impl on a
/// zero-sized tag.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoTarget;
19
/// Type-state marker: the builder has been given target values.
///
/// Used only as a type parameter of [`DatasetBuilder`]; it carries no runtime
/// data. The common traits are derived so that generic code, or a derive on a
/// type parameterised by this marker, is never blocked by a missing impl on a
/// zero-sized tag.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct HasTarget;
23
/// Type-safe builder for [`Dataset`] construction with compile-time validation.
///
/// `DatasetBuilder` records in its last two type parameters whether the
/// feature data and the targets have been supplied. `build()` is implemented
/// only for the `(HasData, HasTarget)` state, so an incomplete dataset is
/// rejected by the compiler instead of at runtime. The state markers are held
/// as `PhantomData`, so they add no data to the builder.
///
/// # Type Parameters
///
/// - `X`: Type of the feature data
/// - `Y`: Type of the target data
/// - `DataState`: Phantom type tracking data state (`NoData` / `HasData`)
/// - `TargetState`: Phantom type tracking target state (`NoTarget` / `HasTarget`)
///
/// # Examples
///
/// ```rust
/// use sklears_core::dataset::Dataset;
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// let features = Array2::<f64>::zeros((100, 4));
/// let targets = Array1::<f64>::zeros(100);
///
/// let dataset = Dataset::builder()
///     .data(features)
///     .target(targets)
///     .description("My dataset".to_string())
///     .feature_names(vec![
///         "f1".to_string(),
///         "f2".to_string(),
///         "f3".to_string(),
///         "f4".to_string(),
///     ])
///     .build();
/// ```
#[derive(Debug)]
#[must_use = "a builder does nothing until `build()` is called on it"]
pub struct DatasetBuilder<X, Y, DataState, TargetState> {
    /// Feature data; `Some` once `data()` has been called.
    data: Option<X>,
    /// Target values; `Some` once `target()` has been called.
    target: Option<Y>,
    /// Feature names; empty unless `feature_names()` was called.
    feature_names: Vec<String>,
    /// Target/class names; `None` unless `target_names()` was called.
    target_names: Option<Vec<String>>,
    /// Free-form description; empty unless `description()` was called.
    description: String,
    /// Zero-sized tag carrying the `DataState` type parameter.
    _phantom_data: std::marker::PhantomData<DataState>,
    /// Zero-sized tag carrying the `TargetState` type parameter.
    _phantom_target: std::marker::PhantomData<TargetState>,
}
63
64impl<X, Y> DatasetBuilder<X, Y, NoData, NoTarget> {
65 /// Create a new dataset builder
66 ///
67 /// The builder starts in the initial state where neither data nor
68 /// targets have been set. Both must be provided before build() can be called.
69 ///
70 /// # Returns
71 ///
72 /// A new DatasetBuilder in the initial (NoData, NoTarget) state
73 pub fn new() -> Self {
74 Self {
75 data: None,
76 target: None,
77 feature_names: Vec::new(),
78 target_names: None,
79 description: String::new(),
80 _phantom_data: std::marker::PhantomData,
81 _phantom_target: std::marker::PhantomData,
82 }
83 }
84}
85
86impl<X, Y, TargetState> DatasetBuilder<X, Y, NoData, TargetState> {
87 /// Set the feature data (required)
88 ///
89 /// This method transitions the builder from NoData to HasData state,
90 /// bringing us closer to being able to build the dataset.
91 ///
92 /// # Arguments
93 ///
94 /// * `data` - The feature matrix or data structure
95 ///
96 /// # Returns
97 ///
98 /// DatasetBuilder with data set (HasData state)
99 pub fn data(self, data: X) -> DatasetBuilder<X, Y, HasData, TargetState> {
100 DatasetBuilder {
101 data: Some(data),
102 target: self.target,
103 feature_names: self.feature_names,
104 target_names: self.target_names,
105 description: self.description,
106 _phantom_data: std::marker::PhantomData,
107 _phantom_target: std::marker::PhantomData,
108 }
109 }
110}
111
112impl<X, Y, DataState> DatasetBuilder<X, Y, DataState, NoTarget> {
113 /// Set the target data (required)
114 ///
115 /// This method transitions the builder from NoTarget to HasTarget state,
116 /// bringing us closer to being able to build the dataset.
117 ///
118 /// # Arguments
119 ///
120 /// * `target` - The target values corresponding to the features
121 ///
122 /// # Returns
123 ///
124 /// DatasetBuilder with target set (HasTarget state)
125 pub fn target(self, target: Y) -> DatasetBuilder<X, Y, DataState, HasTarget> {
126 DatasetBuilder {
127 data: self.data,
128 target: Some(target),
129 feature_names: self.feature_names,
130 target_names: self.target_names,
131 description: self.description,
132 _phantom_data: std::marker::PhantomData,
133 _phantom_target: std::marker::PhantomData,
134 }
135 }
136}
137
138impl<X, Y, DataState, TargetState> DatasetBuilder<X, Y, DataState, TargetState> {
139 /// Set feature names (optional)
140 ///
141 /// Feature names improve interpretability and are used in various
142 /// visualization and analysis tools. This is an optional step.
143 ///
144 /// # Arguments
145 ///
146 /// * `names` - Vector of feature names
147 ///
148 /// # Returns
149 ///
150 /// Self with updated feature names
151 pub fn feature_names(mut self, names: Vec<String>) -> Self {
152 self.feature_names = names;
153 self
154 }
155
156 /// Set target names (optional)
157 ///
158 /// Target names are particularly useful for multi-class classification
159 /// where class labels need to be interpretable. This is an optional step.
160 ///
161 /// # Arguments
162 ///
163 /// * `names` - Vector of class/target names
164 ///
165 /// # Returns
166 ///
167 /// Self with updated target names
168 pub fn target_names(mut self, names: Vec<String>) -> Self {
169 self.target_names = Some(names);
170 self
171 }
172
173 /// Set dataset description (optional)
174 ///
175 /// Descriptions are useful for documenting the source, preprocessing
176 /// steps, or other relevant information. This is an optional step.
177 ///
178 /// # Arguments
179 ///
180 /// * `description` - String description of the dataset
181 ///
182 /// # Returns
183 ///
184 /// Self with updated description
185 pub fn description<S: Into<String>>(mut self, description: S) -> Self {
186 self.description = description.into();
187 self
188 }
189}
190
191impl<X, Y> DatasetBuilder<X, Y, HasData, HasTarget> {
192 /// Build the final dataset
193 ///
194 /// This method is only available when both data and target have been set,
195 /// ensuring compile-time safety. The unwrap() calls are safe because
196 /// the type system guarantees the values are present.
197 ///
198 /// # Returns
199 ///
200 /// A completed Dataset instance
201 pub fn build(self) -> Dataset<X, Y> {
202 Dataset {
203 data: self.data.unwrap(), // Safe: HasData state guarantees this exists
204 target: self.target.unwrap(), // Safe: HasTarget state guarantees this exists
205 feature_names: self.feature_names,
206 target_names: self.target_names,
207 description: self.description,
208 }
209 }
210}
211
212/// Default implementation for the initial builder state
213impl<X, Y> Default for DatasetBuilder<X, Y, NoData, NoTarget> {
214 fn default() -> Self {
215 Self::new()
216 }
217}
218
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{Array1, Array2};

    /// Turn string literals into the owned name vectors the builder expects.
    fn names(raw: &[&str]) -> Vec<String> {
        raw.iter().map(|s| (*s).to_owned()).collect()
    }

    /// Full chain: required inputs plus description and feature names.
    #[test]
    fn test_builder_pattern() {
        let data = Array2::<f64>::zeros((10, 3));
        let target = Array1::<f64>::zeros(10);

        let dataset = DatasetBuilder::new()
            .data(data)
            .target(target)
            .description("Test dataset")
            .feature_names(names(&["f1", "f2", "f3"]))
            .build();

        assert_eq!(dataset.description, "Test dataset");
        assert_eq!(dataset.feature_names.len(), 3);
    }

    /// `data` and `target` may be supplied in either order.
    #[test]
    fn test_builder_order_independence() {
        let data = Array2::<f64>::zeros((5, 2));
        let target = Array1::<f64>::zeros(5);

        let dataset1 = DatasetBuilder::new()
            .data(data.clone())
            .target(target.clone())
            .build();

        let dataset2 = DatasetBuilder::new().target(target).data(data).build();

        // Both should have the same structure
        assert_eq!(dataset1.data.dim(), dataset2.data.dim());
        assert_eq!(dataset1.target.len(), dataset2.target.len());
    }

    /// Every optional metadata setter ends up in the built dataset.
    #[test]
    fn test_builder_with_all_metadata() {
        let data = Array2::<f64>::ones((3, 2));
        let target = Array1::<f64>::ones(3);

        let dataset = DatasetBuilder::new()
            .data(data)
            .target(target)
            .feature_names(names(&["feature1", "feature2"]))
            .target_names(names(&["class_a", "class_b"]))
            .description("Complete dataset example")
            .build();

        assert_eq!(dataset.feature_names.len(), 2);
        assert!(dataset.target_names.is_some());
        assert_eq!(dataset.target_names.as_ref().unwrap().len(), 2);
        assert_eq!(dataset.description, "Complete dataset example");
    }

    /// Without any optional setter the metadata keeps the values from `new()`.
    #[test]
    fn test_builder_defaults_without_metadata() {
        let dataset = DatasetBuilder::new()
            .data(Array2::<f64>::zeros((2, 2)))
            .target(Array1::<f64>::zeros(2))
            .build();

        assert!(dataset.feature_names.is_empty());
        assert!(dataset.target_names.is_none());
        assert!(dataset.description.is_empty());
    }

    /// `Default::default()` is a usable starting point, just like `new()`.
    #[test]
    fn test_default_builder_is_usable() {
        let builder: DatasetBuilder<Array2<f64>, Array1<f64>, NoData, NoTarget> =
            Default::default();

        let dataset = builder
            .data(Array2::<f64>::zeros((4, 1)))
            .target(Array1::<f64>::zeros(4))
            .build();

        assert_eq!(dataset.target.len(), 4);
        assert!(dataset.feature_names.is_empty());
        assert!(dataset.target_names.is_none());
        assert!(dataset.description.is_empty());
    }
}
277}