sklears_core/dataset/
core.rs

//! Core Dataset structure and fundamental operations
//!
//! This module contains the primary Dataset structure and its basic methods.
4use crate::types::{Array1, Array2, Float};
5
6/// A simple dataset structure for machine learning operations
7///
8/// The Dataset struct is the primary data container for sklears, holding
9/// feature matrices and target values along with metadata.
10///
11/// # Type Parameters
12///
13/// - `X`: Type of the feature matrix (defaults to `Array2`<Float>``)
14/// - `Y`: Type of the target values (defaults to `Array1`<Float>``)
15///
16/// # Examples
17///
18/// ```rust
19/// use sklears_core::dataset::Dataset;
20/// use scirs2_core::ndarray::{Array1, Array2};
21///
22/// let features = Array2::`<f64>`::zeros((100, 4));
23/// let targets = Array1::`<f64>`::zeros(100);
24/// let dataset = Dataset::new(features, targets)
25///     .with_description("Sample dataset".to_string());
26/// ```
27#[derive(Debug, Clone)]
28pub struct Dataset<X = Array2<Float>, Y = Array1<Float>> {
29    /// Feature matrix (n_samples x n_features)
30    pub data: X,
31    /// Target values (n_samples,)
32    pub target: Y,
33    /// Feature names for interpretability
34    pub feature_names: Vec<String>,
35    /// Target names (for classification tasks)
36    pub target_names: Option<Vec<String>>,
37    /// Dataset description for documentation
38    pub description: String,
39}
40
41impl<X, Y> Dataset<X, Y> {
42    /// Create a new dataset with the given data and target
43    ///
44    /// This is the primary constructor for creating datasets. Additional
45    /// metadata can be added using builder methods.
46    ///
47    /// # Arguments
48    ///
49    /// * `data` - Feature matrix or data structure
50    /// * `target` - Target values corresponding to the features
51    ///
52    /// # Returns
53    ///
54    /// A new Dataset instance with empty metadata
55    pub fn new(data: X, target: Y) -> Self {
56        Self {
57            data,
58            target,
59            feature_names: Vec::new(),
60            target_names: None,
61            description: String::new(),
62        }
63    }
64
65    /// Create a builder for constructing a dataset with compile-time validation
66    ///
67    /// The builder pattern provides compile-time guarantees that both data
68    /// and targets are provided before the dataset can be constructed.
69    ///
70    /// # Returns
71    ///
72    /// A DatasetBuilder in its initial state
73    pub fn builder() -> crate::dataset::builder::DatasetBuilder<
74        X,
75        Y,
76        crate::dataset::builder::NoData,
77        crate::dataset::builder::NoTarget,
78    > {
79        crate::dataset::builder::DatasetBuilder::new()
80    }
81
82    /// Set feature names for the dataset
83    ///
84    /// Feature names improve interpretability and are used in various
85    /// visualization and analysis tools.
86    ///
87    /// # Arguments
88    ///
89    /// * `names` - Vector of feature names
90    ///
91    /// # Returns
92    ///
93    /// Self with updated feature names
94    pub fn with_feature_names(mut self, names: Vec<String>) -> Self {
95        self.feature_names = names;
96        self
97    }
98
99    /// Set target names for classification tasks
100    ///
101    /// Target names are particularly useful for multi-class classification
102    /// where class labels need to be interpretable.
103    ///
104    /// # Arguments
105    ///
106    /// * `names` - Vector of class/target names
107    ///
108    /// # Returns
109    ///
110    /// Self with updated target names
111    pub fn with_target_names(mut self, names: Vec<String>) -> Self {
112        self.target_names = Some(names);
113        self
114    }
115
116    /// Set a description for the dataset
117    ///
118    /// Descriptions are useful for documenting the source, preprocessing
119    /// steps, or other relevant information about the dataset.
120    ///
121    /// # Arguments
122    ///
123    /// * `description` - String description of the dataset
124    ///
125    /// # Returns
126    ///
127    /// Self with updated description
128    pub fn with_description(mut self, description: String) -> Self {
129        self.description = description;
130        self
131    }
132
133    /// Get the number of samples in the dataset
134    ///
135    /// This is a convenience method that should be implemented by
136    /// types that can determine their sample count.
137    pub fn n_samples(&self) -> Option<usize>
138    where
139        X: HasShape,
140    {
141        self.data.shape().map(|(n_samples, _)| n_samples)
142    }
143
144    /// Get the number of features in the dataset
145    ///
146    /// This is a convenience method that should be implemented by
147    /// types that can determine their feature count.
148    pub fn n_features(&self) -> Option<usize>
149    where
150        X: HasShape,
151    {
152        self.data.shape().map(|(_, n_features)| n_features)
153    }
154}
155
/// Shape provider for feature containers
///
/// A type implementing this trait can report its dimensions, which is what
/// `Dataset::n_samples` and `Dataset::n_features` rely on. Implementors
/// that cannot determine their shape may return `None`.
pub trait HasShape {
    /// Return `(n_samples, n_features)`, or `None` if the shape is unavailable
    fn shape(&self) -> Option<(usize, usize)>;
}
164
165/// Implementation for ndarray Array2
166impl HasShape for Array2<Float> {
167    fn shape(&self) -> Option<(usize, usize)> {
168        let dim = self.dim();
169        Some((dim.0, dim.1))
170    }
171}
172
173// /// Implementation for generic ndarray types (commented out due to conflicting implementations)
174// impl<T, S> HasShape for ndarray::ArrayBase<S, ndarray::Ix2>
175// where
176//     S: ndarray::Data<Elem = T>,
177// {
178//     fn shape(&self) -> Option<(usize, usize)> {
179//         let dim = self.dim();
180//         Some((dim.0, dim.1))
181//     }
182// }
183
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    /// Metadata set through the builder methods is stored, and the shape
    /// helpers report the dimensions of the feature matrix.
    #[test]
    fn test_dataset_creation() {
        let feature_names: Vec<String> = ["f1", "f2", "f3"]
            .iter()
            .map(|name| name.to_string())
            .collect();

        let dataset = Dataset::new(Array2::<f64>::zeros((10, 3)), Array1::<f64>::zeros(10))
            .with_description("Test dataset".to_string())
            .with_feature_names(feature_names);

        assert_eq!(dataset.description, "Test dataset");
        assert_eq!(dataset.feature_names.len(), 3);
        assert_eq!(dataset.n_samples(), Some(10));
        assert_eq!(dataset.n_features(), Some(3));
    }

    /// Target names passed to `with_target_names` end up wrapped in `Some`.
    #[test]
    fn test_dataset_with_target_names() {
        let features = Array2::<f64>::zeros((5, 2));
        let labels = Array1::<f64>::zeros(5);
        let class_names = vec![String::from("class_a"), String::from("class_b")];

        let dataset = Dataset::new(features, labels).with_target_names(class_names);

        assert!(dataset.target_names.is_some());
        assert_eq!(dataset.target_names.as_ref().map(Vec::len), Some(2));
    }
}
216}