scirs2_datasets/utils/
dataset.rs

1//! Core Dataset structure and basic methods
2//!
3//! This module provides the main Dataset struct used throughout the datasets
4//! crate, along with its core methods for creation, metadata management, and
5//! basic properties.
6
7use crate::utils::serialization;
8use scirs2_core::ndarray::{Array1, Array2};
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11
12/// Represents a dataset with features, optional targets, and metadata
13///
14/// The Dataset struct is the core data structure for managing machine learning
15/// datasets. It stores the feature matrix, optional target values, and rich
16/// metadata including feature names, descriptions, and arbitrary key-value pairs.
17///
18/// # Examples
19///
20/// ```rust
21/// use scirs2_core::ndarray::Array2;
22/// use scirs2_datasets::utils::Dataset;
23///
24/// let data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
25/// let dataset = Dataset::new(data, None)
26///     .with_featurenames(vec!["feature1".to_string(), "feature2".to_string()])
27///     .with_description("Sample dataset".to_string());
28///
29/// assert_eq!(dataset.n_samples(), 3);
30/// assert_eq!(dataset.n_features(), 2);
31/// ```
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct Dataset {
34    /// Features/data matrix (n_samples, n_features)
35    #[serde(
36        serialize_with = "serialization::serialize_array2",
37        deserialize_with = "serialization::deserialize_array2"
38    )]
39    pub data: Array2<f64>,
40
41    /// Optional target values
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub target: Option<Array1<f64>>,
44
45    /// Optional target names for classification problems
46    #[serde(skip_serializing_if = "Option::is_none")]
47    pub targetnames: Option<Vec<String>>,
48
49    /// Optional feature names
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub featurenames: Option<Vec<String>>,
52
53    /// Optional descriptions for each feature
54    #[serde(skip_serializing_if = "Option::is_none")]
55    pub feature_descriptions: Option<Vec<String>>,
56
57    /// Optional dataset description
58    #[serde(skip_serializing_if = "Option::is_none")]
59    pub description: Option<String>,
60
61    /// Optional dataset metadata
62    pub metadata: HashMap<String, String>,
63}
64
65impl Dataset {
66    /// Create a new dataset with the given data and target
67    ///
68    /// # Arguments
69    ///
70    /// * `data` - The feature matrix (n_samples, n_features)
71    /// * `target` - Optional target values (n_samples,)
72    ///
73    /// # Returns
74    ///
75    /// A new Dataset instance with empty metadata
76    ///
77    /// # Examples
78    ///
79    /// ```rust
80    /// use scirs2_core::ndarray::{Array1, Array2};
81    /// use scirs2_datasets::utils::Dataset;
82    ///
83    /// let data = Array2::zeros((100, 5));
84    /// let target = Some(Array1::zeros(100));
85    /// let dataset = Dataset::new(data, target);
86    /// ```
87    pub fn new(data: Array2<f64>, target: Option<Array1<f64>>) -> Self {
88        Dataset {
89            data,
90            target,
91            targetnames: None,
92            featurenames: None,
93            feature_descriptions: None,
94            description: None,
95            metadata: HashMap::new(),
96        }
97    }
98
99    /// Create a new dataset with the given data, target, and metadata
100    ///
101    /// # Arguments
102    ///
103    /// * `data` - The feature matrix (n_samples, n_features)
104    /// * `target` - Optional target values (n_samples,)
105    /// * `metadata` - Dataset metadata information
106    ///
107    /// # Returns
108    ///
109    /// A new Dataset instance with metadata applied
110    pub fn from_metadata(
111        data: Array2<f64>,
112        target: Option<Array1<f64>>,
113        metadata: crate::registry::DatasetMetadata,
114    ) -> Self {
115        let mut dataset_metadata = HashMap::new();
116        dataset_metadata.insert("name".to_string(), metadata.name);
117        dataset_metadata.insert("task_type".to_string(), metadata.task_type);
118        dataset_metadata.insert("n_samples".to_string(), metadata.n_samples.to_string());
119        dataset_metadata.insert("n_features".to_string(), metadata.n_features.to_string());
120
121        Dataset {
122            data,
123            target,
124            targetnames: metadata.targetnames,
125            featurenames: None,
126            feature_descriptions: None,
127            description: Some(metadata.description),
128            metadata: dataset_metadata,
129        }
130    }
131
132    /// Add target names to the dataset (builder pattern)
133    ///
134    /// # Arguments
135    ///
136    /// * `targetnames` - Vector of target class names
137    ///
138    /// # Returns
139    ///
140    /// Self for method chaining
141    pub fn with_targetnames(mut self, targetnames: Vec<String>) -> Self {
142        self.targetnames = Some(targetnames);
143        self
144    }
145
146    /// Add feature names to the dataset (builder pattern)
147    ///
148    /// # Arguments
149    ///
150    /// * `featurenames` - Vector of feature names
151    ///
152    /// # Returns
153    ///
154    /// Self for method chaining
155    pub fn with_featurenames(mut self, featurenames: Vec<String>) -> Self {
156        self.featurenames = Some(featurenames);
157        self
158    }
159
160    /// Add feature descriptions to the dataset (builder pattern)
161    ///
162    /// # Arguments
163    ///
164    /// * `feature_descriptions` - Vector of feature descriptions
165    ///
166    /// # Returns
167    ///
168    /// Self for method chaining
169    pub fn with_feature_descriptions(mut self, featuredescriptions: Vec<String>) -> Self {
170        self.feature_descriptions = Some(featuredescriptions);
171        self
172    }
173
174    /// Add a description to the dataset (builder pattern)
175    ///
176    /// # Arguments
177    ///
178    /// * `description` - Dataset description
179    ///
180    /// # Returns
181    ///
182    /// Self for method chaining
183    pub fn with_description(mut self, description: String) -> Self {
184        self.description = Some(description);
185        self
186    }
187
188    /// Add metadata to the dataset (builder pattern)
189    ///
190    /// # Arguments
191    ///
192    /// * `key` - Metadata key
193    /// * `value` - Metadata value
194    ///
195    /// # Returns
196    ///
197    /// Self for method chaining
198    pub fn with_metadata(mut self, key: &str, value: &str) -> Self {
199        self.metadata.insert(key.to_string(), value.to_string());
200        self
201    }
202
203    /// Get the number of samples in the dataset
204    ///
205    /// # Returns
206    ///
207    /// Number of samples (rows) in the dataset
208    pub fn n_samples(&self) -> usize {
209        self.data.nrows()
210    }
211
212    /// Get the number of features in the dataset
213    ///
214    /// # Returns
215    ///
216    /// Number of features (columns) in the dataset
217    pub fn n_features(&self) -> usize {
218        self.data.ncols()
219    }
220
221    /// Get dataset shape as (n_samples, n_features)
222    ///
223    /// # Returns
224    ///
225    /// Tuple of (n_samples, n_features)
226    pub fn shape(&self) -> (usize, usize) {
227        (self.n_samples(), self.n_features())
228    }
229
230    /// Check if the dataset has target values
231    ///
232    /// # Returns
233    ///
234    /// True if target values are present, false otherwise
235    pub fn has_target(&self) -> bool {
236        self.target.is_some()
237    }
238
239    /// Get a reference to the feature names if available
240    ///
241    /// # Returns
242    ///
243    /// Optional reference to feature names vector
244    pub fn featurenames(&self) -> Option<&Vec<String>> {
245        self.featurenames.as_ref()
246    }
247
248    /// Get a reference to the target names if available
249    ///
250    /// # Returns
251    ///
252    /// Optional reference to target names vector  
253    pub fn targetnames(&self) -> Option<&Vec<String>> {
254        self.targetnames.as_ref()
255    }
256
257    /// Get a reference to the dataset description if available
258    ///
259    /// # Returns
260    ///
261    /// Optional reference to dataset description
262    pub fn description(&self) -> Option<&String> {
263        self.description.as_ref()
264    }
265
266    /// Get a reference to the metadata
267    ///
268    /// # Returns
269    ///
270    /// Reference to metadata HashMap
271    pub fn metadata(&self) -> &HashMap<String, String> {
272        &self.metadata
273    }
274
275    /// Add or update a metadata entry
276    ///
277    /// # Arguments
278    ///
279    /// * `key` - Metadata key
280    /// * `value` - Metadata value
281    pub fn set_metadata(&mut self, key: &str, value: &str) {
282        self.metadata.insert(key.to_string(), value.to_string());
283    }
284
285    /// Get a metadata value by key
286    ///
287    /// # Arguments
288    ///
289    /// * `key` - Metadata key to lookup
290    ///
291    /// # Returns
292    ///
293    /// Optional reference to the metadata value
294    pub fn get_metadata(&self, key: &str) -> Option<&String> {
295        self.metadata.get(key)
296    }
297}
298
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// A dataset built with a target reports the shape of its data and keeps
    /// both arrays unchanged.
    #[test]
    fn test_dataset_creation() {
        let features = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let labels = Some(array![0.0, 1.0, 0.0]);

        let dataset = Dataset::new(features.clone(), labels.clone());

        // Dimension accessors
        assert_eq!(dataset.n_samples(), 3);
        assert_eq!(dataset.n_features(), 2);
        assert_eq!(dataset.shape(), (3, 2));

        // Stored contents
        assert!(dataset.has_target());
        assert_eq!(dataset.data, features);
        assert_eq!(dataset.target, labels);
    }

    /// Chained `with_*` calls each leave their value readable through the
    /// matching accessor.
    #[test]
    fn test_dataset_builder_pattern() {
        let names = vec!["feat1".to_string(), "feat2".to_string()];

        let dataset = Dataset::new(array![[1.0, 2.0], [3.0, 4.0]], None)
            .with_featurenames(names)
            .with_description("Test dataset".to_string())
            .with_metadata("version", "1.0")
            .with_metadata("author", "test");

        assert_eq!(dataset.featurenames().map(Vec::len), Some(2));
        assert_eq!(
            dataset.description().map(String::as_str),
            Some("Test dataset")
        );
        assert_eq!(
            dataset.get_metadata("version").map(String::as_str),
            Some("1.0")
        );
        assert_eq!(
            dataset.get_metadata("author").map(String::as_str),
            Some("test")
        );
    }

    /// Passing `None` as the target yields a dataset without targets.
    #[test]
    fn test_dataset_without_target() {
        let dataset = Dataset::new(array![[1.0, 2.0], [3.0, 4.0]], None);

        assert!(dataset.target.is_none());
        assert!(!dataset.has_target());
    }

    /// `set_metadata` inserts new keys, overwrites existing ones, and
    /// `get_metadata` returns `None` for unknown keys.
    #[test]
    fn test_metadata_operations() {
        let mut dataset = Dataset::new(array![[1.0, 2.0]], None);

        dataset.set_metadata("key1", "value1");
        dataset.set_metadata("key2", "value2");

        assert_eq!(
            dataset.get_metadata("key1").map(String::as_str),
            Some("value1")
        );
        assert_eq!(
            dataset.get_metadata("key2").map(String::as_str),
            Some("value2")
        );
        assert_eq!(dataset.get_metadata("nonexistent"), None);

        // Writing to an existing key replaces its value.
        dataset.set_metadata("key1", "updated_value");
        assert_eq!(
            dataset.get_metadata("key1").map(String::as_str),
            Some("updated_value")
        );
    }
}