scirs2_datasets/utils/dataset.rs
1//! Core Dataset structure and basic methods
2//!
3//! This module provides the main Dataset struct used throughout the datasets
4//! crate, along with its core methods for creation, metadata management, and
5//! basic properties.
6
7use crate::utils::serialization;
8use scirs2_core::ndarray::{Array1, Array2};
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11
12/// Represents a dataset with features, optional targets, and metadata
13///
14/// The Dataset struct is the core data structure for managing machine learning
15/// datasets. It stores the feature matrix, optional target values, and rich
16/// metadata including feature names, descriptions, and arbitrary key-value pairs.
17///
18/// # Examples
19///
20/// ```rust
21/// use scirs2_core::ndarray::Array2;
22/// use scirs2_datasets::utils::Dataset;
23///
24/// let data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
25/// let dataset = Dataset::new(data, None)
26/// .with_featurenames(vec!["feature1".to_string(), "feature2".to_string()])
27/// .with_description("Sample dataset".to_string());
28///
29/// assert_eq!(dataset.n_samples(), 3);
30/// assert_eq!(dataset.n_features(), 2);
31/// ```
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct Dataset {
34 /// Features/data matrix (n_samples, n_features)
35 #[serde(
36 serialize_with = "serialization::serialize_array2",
37 deserialize_with = "serialization::deserialize_array2"
38 )]
39 pub data: Array2<f64>,
40
41 /// Optional target values
42 #[serde(skip_serializing_if = "Option::is_none")]
43 pub target: Option<Array1<f64>>,
44
45 /// Optional target names for classification problems
46 #[serde(skip_serializing_if = "Option::is_none")]
47 pub targetnames: Option<Vec<String>>,
48
49 /// Optional feature names
50 #[serde(skip_serializing_if = "Option::is_none")]
51 pub featurenames: Option<Vec<String>>,
52
53 /// Optional descriptions for each feature
54 #[serde(skip_serializing_if = "Option::is_none")]
55 pub feature_descriptions: Option<Vec<String>>,
56
57 /// Optional dataset description
58 #[serde(skip_serializing_if = "Option::is_none")]
59 pub description: Option<String>,
60
61 /// Optional dataset metadata
62 pub metadata: HashMap<String, String>,
63}
64
65impl Dataset {
66 /// Create a new dataset with the given data and target
67 ///
68 /// # Arguments
69 ///
70 /// * `data` - The feature matrix (n_samples, n_features)
71 /// * `target` - Optional target values (n_samples,)
72 ///
73 /// # Returns
74 ///
75 /// A new Dataset instance with empty metadata
76 ///
77 /// # Examples
78 ///
79 /// ```rust
80 /// use scirs2_core::ndarray::{Array1, Array2};
81 /// use scirs2_datasets::utils::Dataset;
82 ///
83 /// let data = Array2::zeros((100, 5));
84 /// let target = Some(Array1::zeros(100));
85 /// let dataset = Dataset::new(data, target);
86 /// ```
87 pub fn new(data: Array2<f64>, target: Option<Array1<f64>>) -> Self {
88 Dataset {
89 data,
90 target,
91 targetnames: None,
92 featurenames: None,
93 feature_descriptions: None,
94 description: None,
95 metadata: HashMap::new(),
96 }
97 }
98
99 /// Create a new dataset with the given data, target, and metadata
100 ///
101 /// # Arguments
102 ///
103 /// * `data` - The feature matrix (n_samples, n_features)
104 /// * `target` - Optional target values (n_samples,)
105 /// * `metadata` - Dataset metadata information
106 ///
107 /// # Returns
108 ///
109 /// A new Dataset instance with metadata applied
110 pub fn from_metadata(
111 data: Array2<f64>,
112 target: Option<Array1<f64>>,
113 metadata: crate::registry::DatasetMetadata,
114 ) -> Self {
115 let mut dataset_metadata = HashMap::new();
116 dataset_metadata.insert("name".to_string(), metadata.name);
117 dataset_metadata.insert("task_type".to_string(), metadata.task_type);
118 dataset_metadata.insert("n_samples".to_string(), metadata.n_samples.to_string());
119 dataset_metadata.insert("n_features".to_string(), metadata.n_features.to_string());
120
121 Dataset {
122 data,
123 target,
124 targetnames: metadata.targetnames,
125 featurenames: None,
126 feature_descriptions: None,
127 description: Some(metadata.description),
128 metadata: dataset_metadata,
129 }
130 }
131
132 /// Add target names to the dataset (builder pattern)
133 ///
134 /// # Arguments
135 ///
136 /// * `targetnames` - Vector of target class names
137 ///
138 /// # Returns
139 ///
140 /// Self for method chaining
141 pub fn with_targetnames(mut self, targetnames: Vec<String>) -> Self {
142 self.targetnames = Some(targetnames);
143 self
144 }
145
146 /// Add feature names to the dataset (builder pattern)
147 ///
148 /// # Arguments
149 ///
150 /// * `featurenames` - Vector of feature names
151 ///
152 /// # Returns
153 ///
154 /// Self for method chaining
155 pub fn with_featurenames(mut self, featurenames: Vec<String>) -> Self {
156 self.featurenames = Some(featurenames);
157 self
158 }
159
160 /// Add feature descriptions to the dataset (builder pattern)
161 ///
162 /// # Arguments
163 ///
164 /// * `feature_descriptions` - Vector of feature descriptions
165 ///
166 /// # Returns
167 ///
168 /// Self for method chaining
169 pub fn with_feature_descriptions(mut self, featuredescriptions: Vec<String>) -> Self {
170 self.feature_descriptions = Some(featuredescriptions);
171 self
172 }
173
174 /// Add a description to the dataset (builder pattern)
175 ///
176 /// # Arguments
177 ///
178 /// * `description` - Dataset description
179 ///
180 /// # Returns
181 ///
182 /// Self for method chaining
183 pub fn with_description(mut self, description: String) -> Self {
184 self.description = Some(description);
185 self
186 }
187
188 /// Add metadata to the dataset (builder pattern)
189 ///
190 /// # Arguments
191 ///
192 /// * `key` - Metadata key
193 /// * `value` - Metadata value
194 ///
195 /// # Returns
196 ///
197 /// Self for method chaining
198 pub fn with_metadata(mut self, key: &str, value: &str) -> Self {
199 self.metadata.insert(key.to_string(), value.to_string());
200 self
201 }
202
203 /// Get the number of samples in the dataset
204 ///
205 /// # Returns
206 ///
207 /// Number of samples (rows) in the dataset
208 pub fn n_samples(&self) -> usize {
209 self.data.nrows()
210 }
211
212 /// Get the number of features in the dataset
213 ///
214 /// # Returns
215 ///
216 /// Number of features (columns) in the dataset
217 pub fn n_features(&self) -> usize {
218 self.data.ncols()
219 }
220
221 /// Get dataset shape as (n_samples, n_features)
222 ///
223 /// # Returns
224 ///
225 /// Tuple of (n_samples, n_features)
226 pub fn shape(&self) -> (usize, usize) {
227 (self.n_samples(), self.n_features())
228 }
229
230 /// Check if the dataset has target values
231 ///
232 /// # Returns
233 ///
234 /// True if target values are present, false otherwise
235 pub fn has_target(&self) -> bool {
236 self.target.is_some()
237 }
238
239 /// Get a reference to the feature names if available
240 ///
241 /// # Returns
242 ///
243 /// Optional reference to feature names vector
244 pub fn featurenames(&self) -> Option<&Vec<String>> {
245 self.featurenames.as_ref()
246 }
247
248 /// Get a reference to the target names if available
249 ///
250 /// # Returns
251 ///
252 /// Optional reference to target names vector
253 pub fn targetnames(&self) -> Option<&Vec<String>> {
254 self.targetnames.as_ref()
255 }
256
257 /// Get a reference to the dataset description if available
258 ///
259 /// # Returns
260 ///
261 /// Optional reference to dataset description
262 pub fn description(&self) -> Option<&String> {
263 self.description.as_ref()
264 }
265
266 /// Get a reference to the metadata
267 ///
268 /// # Returns
269 ///
270 /// Reference to metadata HashMap
271 pub fn metadata(&self) -> &HashMap<String, String> {
272 &self.metadata
273 }
274
275 /// Add or update a metadata entry
276 ///
277 /// # Arguments
278 ///
279 /// * `key` - Metadata key
280 /// * `value` - Metadata value
281 pub fn set_metadata(&mut self, key: &str, value: &str) {
282 self.metadata.insert(key.to_string(), value.to_string());
283 }
284
285 /// Get a metadata value by key
286 ///
287 /// # Arguments
288 ///
289 /// * `key` - Metadata key to lookup
290 ///
291 /// # Returns
292 ///
293 /// Optional reference to the metadata value
294 pub fn get_metadata(&self, key: &str) -> Option<&String> {
295 self.metadata.get(key)
296 }
297}
298
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// A dataset built from features and targets reports the right shape and
    /// keeps both arrays unchanged.
    #[test]
    fn test_dataset_creation() {
        let features = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let labels = Some(array![0.0, 1.0, 0.0]);

        let ds = Dataset::new(features.clone(), labels.clone());

        assert_eq!(ds.n_samples(), 3);
        assert_eq!(ds.n_features(), 2);
        assert_eq!(ds.shape(), (3, 2));
        assert!(ds.has_target());
        assert_eq!(ds.data, features);
        assert_eq!(ds.target, labels);
    }

    /// Chained builder calls populate names, description and metadata.
    #[test]
    fn test_dataset_builder_pattern() {
        let names = vec!["feat1".to_string(), "feat2".to_string()];

        let ds = Dataset::new(array![[1.0, 2.0], [3.0, 4.0]], None)
            .with_featurenames(names)
            .with_description("Test dataset".to_string())
            .with_metadata("version", "1.0")
            .with_metadata("author", "test");

        assert_eq!(ds.featurenames().map(Vec::len), Some(2));
        assert_eq!(ds.description().map(String::as_str), Some("Test dataset"));
        assert_eq!(ds.get_metadata("version").map(String::as_str), Some("1.0"));
        assert_eq!(ds.get_metadata("author").map(String::as_str), Some("test"));
    }

    /// Passing `None` as the target yields a dataset without targets.
    #[test]
    fn test_dataset_without_target() {
        let ds = Dataset::new(array![[1.0, 2.0], [3.0, 4.0]], None);

        assert!(ds.target.is_none());
        assert!(!ds.has_target());
    }

    /// Metadata entries can be inserted, looked up and overwritten.
    #[test]
    fn test_metadata_operations() {
        let mut ds = Dataset::new(array![[1.0, 2.0]], None);

        let entries = [("key1", "value1"), ("key2", "value2")];
        for &(key, value) in entries.iter() {
            ds.set_metadata(key, value);
        }

        for &(key, value) in entries.iter() {
            assert_eq!(ds.get_metadata(key).map(String::as_str), Some(value));
        }
        assert_eq!(ds.get_metadata("nonexistent"), None);

        // Writing to an existing key replaces the stored value.
        ds.set_metadata("key1", "updated_value");
        assert_eq!(
            ds.get_metadata("key1").map(String::as_str),
            Some("updated_value")
        );
    }
}