sklears_core/dataset/core.rs
//! Core Dataset structure and fundamental operations
//!
//! This module contains the primary `Dataset` structure and its basic methods.
4use crate::types::{Array1, Array2, Float};
5
6/// A simple dataset structure for machine learning operations
7///
8/// The Dataset struct is the primary data container for sklears, holding
9/// feature matrices and target values along with metadata.
10///
11/// # Type Parameters
12///
13/// - `X`: Type of the feature matrix (defaults to `Array2`<Float>``)
14/// - `Y`: Type of the target values (defaults to `Array1`<Float>``)
15///
16/// # Examples
17///
18/// ```rust
19/// use sklears_core::dataset::Dataset;
20/// use scirs2_core::ndarray::{Array1, Array2};
21///
22/// let features = Array2::`<f64>`::zeros((100, 4));
23/// let targets = Array1::`<f64>`::zeros(100);
24/// let dataset = Dataset::new(features, targets)
25/// .with_description("Sample dataset".to_string());
26/// ```
27#[derive(Debug, Clone)]
28pub struct Dataset<X = Array2<Float>, Y = Array1<Float>> {
29 /// Feature matrix (n_samples x n_features)
30 pub data: X,
31 /// Target values (n_samples,)
32 pub target: Y,
33 /// Feature names for interpretability
34 pub feature_names: Vec<String>,
35 /// Target names (for classification tasks)
36 pub target_names: Option<Vec<String>>,
37 /// Dataset description for documentation
38 pub description: String,
39}
40
41impl<X, Y> Dataset<X, Y> {
42 /// Create a new dataset with the given data and target
43 ///
44 /// This is the primary constructor for creating datasets. Additional
45 /// metadata can be added using builder methods.
46 ///
47 /// # Arguments
48 ///
49 /// * `data` - Feature matrix or data structure
50 /// * `target` - Target values corresponding to the features
51 ///
52 /// # Returns
53 ///
54 /// A new Dataset instance with empty metadata
55 pub fn new(data: X, target: Y) -> Self {
56 Self {
57 data,
58 target,
59 feature_names: Vec::new(),
60 target_names: None,
61 description: String::new(),
62 }
63 }
64
65 /// Create a builder for constructing a dataset with compile-time validation
66 ///
67 /// The builder pattern provides compile-time guarantees that both data
68 /// and targets are provided before the dataset can be constructed.
69 ///
70 /// # Returns
71 ///
72 /// A DatasetBuilder in its initial state
73 pub fn builder() -> crate::dataset::builder::DatasetBuilder<
74 X,
75 Y,
76 crate::dataset::builder::NoData,
77 crate::dataset::builder::NoTarget,
78 > {
79 crate::dataset::builder::DatasetBuilder::new()
80 }
81
82 /// Set feature names for the dataset
83 ///
84 /// Feature names improve interpretability and are used in various
85 /// visualization and analysis tools.
86 ///
87 /// # Arguments
88 ///
89 /// * `names` - Vector of feature names
90 ///
91 /// # Returns
92 ///
93 /// Self with updated feature names
94 pub fn with_feature_names(mut self, names: Vec<String>) -> Self {
95 self.feature_names = names;
96 self
97 }
98
99 /// Set target names for classification tasks
100 ///
101 /// Target names are particularly useful for multi-class classification
102 /// where class labels need to be interpretable.
103 ///
104 /// # Arguments
105 ///
106 /// * `names` - Vector of class/target names
107 ///
108 /// # Returns
109 ///
110 /// Self with updated target names
111 pub fn with_target_names(mut self, names: Vec<String>) -> Self {
112 self.target_names = Some(names);
113 self
114 }
115
116 /// Set a description for the dataset
117 ///
118 /// Descriptions are useful for documenting the source, preprocessing
119 /// steps, or other relevant information about the dataset.
120 ///
121 /// # Arguments
122 ///
123 /// * `description` - String description of the dataset
124 ///
125 /// # Returns
126 ///
127 /// Self with updated description
128 pub fn with_description(mut self, description: String) -> Self {
129 self.description = description;
130 self
131 }
132
133 /// Get the number of samples in the dataset
134 ///
135 /// This is a convenience method that should be implemented by
136 /// types that can determine their sample count.
137 pub fn n_samples(&self) -> Option<usize>
138 where
139 X: HasShape,
140 {
141 self.data.shape().map(|(n_samples, _)| n_samples)
142 }
143
144 /// Get the number of features in the dataset
145 ///
146 /// This is a convenience method that should be implemented by
147 /// types that can determine their feature count.
148 pub fn n_features(&self) -> Option<usize>
149 where
150 X: HasShape,
151 {
152 self.data.shape().map(|(_, n_features)| n_features)
153 }
154}
155
/// Shape introspection for feature containers
///
/// `Dataset::n_samples` and `Dataset::n_features` are available whenever the
/// feature container implements this trait, which lets different backend
/// types expose their dimensions through one common interface.
pub trait HasShape {
    /// Report the container's shape as `(n_samples, n_features)`.
    ///
    /// Implementations return `None` when the shape is not available.
    fn shape(&self) -> Option<(usize, usize)>;
}
164
165/// Implementation for ndarray Array2
166impl HasShape for Array2<Float> {
167 fn shape(&self) -> Option<(usize, usize)> {
168 let dim = self.dim();
169 Some((dim.0, dim.1))
170 }
171}
172
// NOTE: Generic implementation for any two-dimensional ndarray `ArrayBase`, kept for reference.
// It is commented out because it conflicts with the concrete `Array2<Float>` implementation above.
174// impl<T, S> HasShape for ndarray::ArrayBase<S, ndarray::Ix2>
175// where
176// S: ndarray::Data<Elem = T>,
177// {
178// fn shape(&self) -> Option<(usize, usize)> {
179// let dim = self.dim();
180// Some((dim.0, dim.1))
181// }
182// }
183
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    /// `Dataset::new` must start with empty metadata.
    #[test]
    fn test_dataset_new_has_empty_metadata() {
        let data = Array2::<f64>::zeros((4, 2));
        let target = Array1::<f64>::zeros(4);

        let dataset = Dataset::new(data, target);

        assert!(dataset.feature_names.is_empty());
        assert!(dataset.target_names.is_none());
        assert!(dataset.description.is_empty());
    }

    /// Builder methods store exactly what they are given, and the shape
    /// accessors reflect the dimensions of the feature matrix.
    #[test]
    fn test_dataset_creation() {
        let data = Array2::<f64>::zeros((10, 3));
        let target = Array1::<f64>::zeros(10);

        let dataset = Dataset::new(data, target)
            .with_description("Test dataset".to_string())
            .with_feature_names(vec!["f1".to_string(), "f2".to_string(), "f3".to_string()]);

        assert_eq!(dataset.description, "Test dataset");
        assert_eq!(dataset.feature_names, ["f1", "f2", "f3"]);
        assert_eq!(dataset.n_samples(), Some(10));
        assert_eq!(dataset.n_features(), Some(3));
        // Setting other metadata must not implicitly create target names.
        assert!(dataset.target_names.is_none());
    }

    /// `with_target_names` stores the names, in order, inside `Some`.
    #[test]
    fn test_dataset_with_target_names() {
        let data = Array2::<f64>::zeros((5, 2));
        let target = Array1::<f64>::zeros(5);

        let dataset = Dataset::new(data, target)
            .with_target_names(vec!["class_a".to_string(), "class_b".to_string()]);

        let names = dataset
            .target_names
            .expect("with_target_names always stores Some(names)");
        assert_eq!(names, ["class_a", "class_b"]);
    }
}