pub struct Dataset {
pub data: Array2<f64>,
pub target: Option<Array1<f64>>,
pub target_names: Option<Vec<String>>,
pub feature_names: Option<Vec<String>>,
pub feature_descriptions: Option<Vec<String>>,
pub description: Option<String>,
pub metadata: HashMap<String, String>,
}
Represents a dataset with features, optional targets, and metadata
The Dataset struct is the core data structure for managing machine learning datasets. It stores the feature matrix, optional target values, and rich metadata including feature names, descriptions, and arbitrary key-value pairs.
§Examples
use ndarray::Array2;
use scirs2_datasets::utils::Dataset;
let data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let dataset = Dataset::new(data, None)
.with_feature_names(vec!["feature1".to_string(), "feature2".to_string()])
.with_description("Sample dataset".to_string());
assert_eq!(dataset.n_samples(), 3);
assert_eq!(dataset.n_features(), 2);
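A dataset with targets works the same way; a minimal sketch using the builder and accessor methods documented below (the target names are illustrative):
use ndarray::{Array1, Array2};
use scirs2_datasets::utils::Dataset;
let data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let target = Array1::from(vec![0.0, 1.0, 0.0]);
let dataset = Dataset::new(data, Some(target))
    .with_target_names(vec!["class_a".to_string(), "class_b".to_string()]);
assert!(dataset.has_target());
assert_eq!(dataset.n_samples(), 3);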
§Fields
data: Array2<f64>
Features/data matrix (n_samples, n_features)
target: Option<Array1<f64>>
Optional target values
target_names: Option<Vec<String>>
Optional target names for classification problems
feature_names: Option<Vec<String>>
Optional feature names
feature_descriptions: Option<Vec<String>>
Optional descriptions for each feature
description: Option<String>
Optional dataset description
metadata: HashMap<String, String>
Dataset metadata as arbitrary key-value pairs
§Implementations
impl Dataset
pub fn new(data: Array2<f64>, target: Option<Array1<f64>>) -> Self
Create a new dataset with the given data and target
§Arguments
data - The feature matrix (n_samples, n_features)
target - Optional target values (n_samples,)
§Returns
A new Dataset instance with empty metadata
§Examples
use ndarray::{Array1, Array2};
use scirs2_datasets::utils::Dataset;
let data = Array2::zeros((100, 5));
let target = Some(Array1::zeros(100));
let dataset = Dataset::new(data, target);
Examples found in repository
fn main() {
    println!("=== Cross-Validation Demonstration ===\n");

    // Create sample dataset
    let data = Array2::from_shape_vec((20, 3), (0..60).map(|x| x as f64 / 10.0).collect()).unwrap();
    let target = Array1::from(
        (0..20)
            .map(|i| if i % 2 == 0 { 0.0 } else { 1.0 })
            .collect::<Vec<_>>(),
    );

    let dataset = Dataset::new(data.clone(), Some(target.clone()))
        .with_description("Sample dataset for cross-validation demo".to_string());

    println!("Dataset info:");
    println!("- Samples: {}", dataset.n_samples());
    println!("- Features: {}", dataset.n_features());
    println!("- Description: {}\n", dataset.description.as_ref().unwrap());

    // Demonstrate K-fold cross-validation
    println!("=== K-Fold Cross-Validation (k=5) ===");
    let k_folds = k_fold_split(dataset.n_samples(), 5, true, Some(42)).unwrap();

    for (i, (train_indices, val_indices)) in k_folds.iter().enumerate() {
        println!(
            "Fold {}: Train size: {}, Validation size: {}",
            i + 1,
            train_indices.len(),
            val_indices.len()
        );
        println!(
            " Train indices: {:?}",
            &train_indices[..5.min(train_indices.len())]
        );
        println!(" Val indices: {:?}", val_indices);
    }
    println!();

    // Demonstrate Stratified K-fold cross-validation
    println!("=== Stratified K-Fold Cross-Validation (k=4) ===");
    let stratified_folds = stratified_k_fold_split(&target, 4, true, Some(42)).unwrap();

    for (i, (train_indices, val_indices)) in stratified_folds.iter().enumerate() {
        // Calculate class distribution in validation set
        let val_targets: Vec<f64> = val_indices.iter().map(|&idx| target[idx]).collect();
        let class_0_count = val_targets.iter().filter(|&&x| x == 0.0).count();
        let class_1_count = val_targets.iter().filter(|&&x| x == 1.0).count();

        println!(
            "Fold {}: Train size: {}, Validation size: {}",
            i + 1,
            train_indices.len(),
            val_indices.len()
        );
        println!(
            " Class distribution in validation: Class 0: {}, Class 1: {}",
            class_0_count, class_1_count
        );
    }
    println!();

    // Demonstrate Time Series cross-validation
    println!("=== Time Series Cross-Validation ===");
    let ts_folds = time_series_split(dataset.n_samples(), 3, 3, 1).unwrap();

    for (i, (train_indices, val_indices)) in ts_folds.iter().enumerate() {
        println!(
            "Split {}: Train size: {}, Test size: {}",
            i + 1,
            train_indices.len(),
            val_indices.len()
        );
        println!(
            " Train range: {} to {}",
            train_indices.first().unwrap_or(&0),
            train_indices.last().unwrap_or(&0)
        );
        println!(
            " Test range: {} to {}",
            val_indices.first().unwrap_or(&0),
            val_indices.last().unwrap_or(&0)
        );
    }
    println!();

    // Demonstrate usage with Dataset methods
    println!("=== Using Cross-Validation with Dataset ===");
    let first_fold = &k_folds[0];
    let (train_indices, val_indices) = first_fold;

    // Create training subset
    let train_data = data.select(ndarray::Axis(0), train_indices);
    let train_target = target.select(ndarray::Axis(0), train_indices);
    let train_dataset = Dataset::new(train_data, Some(train_target))
        .with_description("Training fold from K-fold CV".to_string());

    // Create validation subset
    let val_data = data.select(ndarray::Axis(0), val_indices);
    let val_target = target.select(ndarray::Axis(0), val_indices);
    let val_dataset = Dataset::new(val_data, Some(val_target))
        .with_description("Validation fold from K-fold CV".to_string());

    println!(
        "Training dataset: {} samples, {} features",
        train_dataset.n_samples(),
        train_dataset.n_features()
    );
    println!(
        "Validation dataset: {} samples, {} features",
        val_dataset.n_samples(),
        val_dataset.n_features()
    );

    println!("\n=== Cross-Validation Demo Complete ===");
}
More examples
fn main() {
    println!("=== Sampling and Bootstrapping Demonstration ===\n");

    // Load the Iris dataset for demonstration
    let iris = load_iris().unwrap();
    let n_samples = iris.n_samples();

    println!("Original Iris dataset:");
    println!("- Samples: {}", n_samples);
    println!("- Features: {}", iris.n_features());

    if let Some(target) = &iris.target {
        let class_counts = count_classes(target);
        println!("- Class distribution: {:?}\n", class_counts);
    }

    // Demonstrate random sampling without replacement
    println!("=== Random Sampling (without replacement) ===");
    let sample_size = 30;
    let random_indices = random_sample(n_samples, sample_size, false, Some(42)).unwrap();

    println!(
        "Sampled {} indices from {} total samples",
        sample_size, n_samples
    );
    println!(
        "Sample indices: {:?}",
        &random_indices[..10.min(random_indices.len())]
    );

    // Create a subset dataset
    let sample_data = iris.data.select(ndarray::Axis(0), &random_indices);
    let sample_target = iris
        .target
        .as_ref()
        .map(|t| t.select(ndarray::Axis(0), &random_indices));
    let sample_dataset = Dataset::new(sample_data, sample_target)
        .with_description("Random sample from Iris dataset".to_string());

    println!(
        "Random sample dataset: {} samples, {} features",
        sample_dataset.n_samples(),
        sample_dataset.n_features()
    );

    if let Some(target) = &sample_dataset.target {
        let sample_class_counts = count_classes(target);
        println!("Sample class distribution: {:?}\n", sample_class_counts);
    }

    // Demonstrate bootstrap sampling (with replacement)
    println!("=== Bootstrap Sampling (with replacement) ===");
    let bootstrap_size = 200; // More than original dataset size
    let bootstrap_indices = random_sample(n_samples, bootstrap_size, true, Some(42)).unwrap();

    println!(
        "Bootstrap sampled {} indices from {} total samples",
        bootstrap_size, n_samples
    );
    println!(
        "Bootstrap may have duplicates - first 10 indices: {:?}",
        &bootstrap_indices[..10]
    );

    // Count frequency of each index in bootstrap sample
    let mut index_counts = vec![0; n_samples];
    for &idx in &bootstrap_indices {
        index_counts[idx] += 1;
    }
    let max_count = *index_counts.iter().max().unwrap();
    let zero_count = index_counts.iter().filter(|&&count| count == 0).count();

    println!("Bootstrap statistics:");
    println!("- Maximum frequency of any sample: {}", max_count);
    println!(
        "- Number of original samples not selected: {}\n",
        zero_count
    );

    // Demonstrate stratified sampling
    println!("=== Stratified Sampling ===");
    if let Some(target) = &iris.target {
        let stratified_size = 30;
        let stratified_indices = stratified_sample(target, stratified_size, Some(42)).unwrap();

        println!(
            "Stratified sampled {} indices maintaining class proportions",
            stratified_size
        );

        // Create stratified subset
        let stratified_data = iris.data.select(ndarray::Axis(0), &stratified_indices);
        let stratified_target = target.select(ndarray::Axis(0), &stratified_indices);
        let stratified_dataset = Dataset::new(stratified_data, Some(stratified_target))
            .with_description("Stratified sample from Iris dataset".to_string());

        println!(
            "Stratified sample dataset: {} samples, {} features",
            stratified_dataset.n_samples(),
            stratified_dataset.n_features()
        );

        let stratified_class_counts = count_classes(&stratified_dataset.target.unwrap());
        println!(
            "Stratified sample class distribution: {:?}",
            stratified_class_counts
        );

        // Verify proportions are maintained
        let original_proportions = calculate_proportions(&count_classes(target));
        let stratified_proportions = calculate_proportions(&stratified_class_counts);

        println!("Class proportion comparison:");
        for (&class, &original_prop) in &original_proportions {
            let stratified_prop = stratified_proportions.get(&class).unwrap_or(&0.0);
            println!(
                " Class {}: Original {:.2}%, Stratified {:.2}%",
                class,
                original_prop * 100.0,
                stratified_prop * 100.0
            );
        }
    }

    // Demonstrate practical use case: creating training/validation splits
    println!("\n=== Practical Example: Multiple Train/Validation Splits ===");
    for i in 1..=3 {
        let split_indices = random_sample(n_samples, 100, false, Some(42 + i)).unwrap();
        let (train_indices, val_indices) = split_indices.split_at(80);

        println!(
            "Split {}: {} training samples, {} validation samples",
            i,
            train_indices.len(),
            val_indices.len()
        );
    }

    println!("\n=== Sampling Demo Complete ===");
}
pub fn with_target_names(self, target_names: Vec<String>) -> Self
Set the target names for the dataset (builder pattern)
pub fn with_feature_names(self, feature_names: Vec<String>) -> Self
Set the feature names for the dataset (builder pattern)
pub fn with_feature_descriptions(self, feature_descriptions: Vec<String>) -> Self
Set per-feature descriptions for the dataset (builder pattern)
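These builder methods chain naturally; a minimal sketch (the feature names and descriptions are illustrative):
use ndarray::Array2;
use scirs2_datasets::utils::Dataset;
let data = Array2::zeros((3, 2));
let dataset = Dataset::new(data, None)
    .with_feature_names(vec!["height".to_string(), "weight".to_string()])
    .with_feature_descriptions(vec![
        "Height in cm".to_string(),
        "Weight in kg".to_string(),
    ]);
assert_eq!(dataset.feature_names.as_ref().map(|v| v.len()), Some(2));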
pub fn with_description(self, description: String) -> Self
Add a description to the dataset (builder pattern)
§Arguments
description - Dataset description
§Returns
Self for method chaining
Examples found in repository
The cross-validation and sampling examples shown under new above also exercise with_description.
pub fn with_metadata(self, key: &str, value: &str) -> Self
Add a metadata key-value pair to the dataset (builder pattern)
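A minimal sketch of attaching and reading metadata (the key and value strings are illustrative):
use ndarray::Array2;
use scirs2_datasets::utils::Dataset;
let data = Array2::zeros((10, 2));
let dataset = Dataset::new(data, None)
    .with_metadata("source", "synthetic")
    .with_metadata("version", "1");
assert_eq!(dataset.metadata.get("source"), Some(&"synthetic".to_string()));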
pub fn n_samples(&self) -> usize
Returns the number of samples (rows) in the dataset
Examples found in repository
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let iris = load_iris()?;
    println!("Iris dataset loaded:");
    println!(" Samples: {}", iris.n_samples());
    println!(" Features: {}", iris.n_features());
    println!(
        " Target classes: {}",
        iris.target_names.as_ref().map_or(0, |v| v.len())
    );

    let boston = load_boston()?;
    println!("\nBoston Housing dataset loaded:");
    println!(" Samples: {}", boston.n_samples());
    println!(" Features: {}", boston.n_features());

    Ok(())
}
More examples
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load a CSV file with headers and target column
    let dataset = load_csv(
        "scirs2-datasets/data/example.csv",
        true,    // has header
        Some(3), // target column index (0-based)
    )?;

    println!("CSV dataset loaded successfully:");
    println!(" Samples: {}", dataset.n_samples());
    println!(" Features: {}", dataset.n_features());
    println!(" Feature names: {:?}", dataset.feature_names);

    // Access data and target
    println!("\nFirst 3 samples:");
    for i in 0..3 {
        let features = dataset.data.row(i);
        let target = dataset.target.as_ref().map(|t| t[i]);
        println!(
            " Sample {}: Features = {:?}, Target = {:?}",
            i, features, target
        );
    }

    Ok(())
}
fn demonstrate_comprehensive_corruption() {
    println!("Testing comprehensive dataset corruption:");

    // Load a real dataset
    let iris = load_iris().unwrap();
    println!(
        "Original Iris dataset: {} samples, {} features",
        iris.n_samples(),
        iris.n_features()
    );

    let original_stats = calculate_basic_stats(&iris.data);
    println!(
        "Original stats - Mean: {:.3}, Std: {:.3}",
        original_stats.0, original_stats.1
    );

    // Create different levels of corruption
    let corruption_levels = [
        (0.05, 0.02, "Light corruption"),
        (0.1, 0.05, "Moderate corruption"),
        (0.2, 0.1, "Heavy corruption"),
        (0.3, 0.15, "Severe corruption"),
    ];

    for (missing_rate, outlier_rate, description) in corruption_levels {
        let corrupted = make_corrupted_dataset(
            &iris,
            missing_rate,
            MissingPattern::MAR, // More realistic than MCAR
            outlier_rate,
            OutlierType::Point,
            2.5,
            Some(42),
        )
        .unwrap();

        // Calculate how much data is usable
        let total_elements = corrupted.data.len();
        let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
        let usable_percentage =
            ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;

        println!("{}:", description);
        println!(" Missing data: {:.1}%", missing_rate * 100.0);
        println!(" Outliers: {:.1}%", outlier_rate * 100.0);
        println!(" Usable data: {:.1}%", usable_percentage);

        // Show metadata
        if let Some(missing_count) = corrupted.metadata.get("missing_count") {
            println!(" Actual missing: {} elements", missing_count);
        }
        if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
            println!(" Actual outliers: {} samples", outlier_count);
        }
    }
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("Loading time series datasets...\n");

    // Load the electrocardiogram dataset
    let ecg = electrocardiogram()?;

    println!("Electrocardiogram dataset:");
    println!(" Time steps: {}", ecg.n_samples());
    println!(" Features: {}", ecg.n_features());
    println!(
        " Sampling rate: {} Hz",
        ecg.metadata
            .get("sampling_rate")
            .unwrap_or(&"unknown".to_string())
    );
    println!(
        " Duration: {}",
        ecg.metadata
            .get("duration")
            .unwrap_or(&"unknown".to_string())
    );

    // Get a slice of the data and display basic statistics
    let ecg_slice = ecg.data.slice(s![0..10, 0]);
    println!(" First 10 data points: {:?}", ecg_slice);

    // Calculate some basic statistics
    let ecg_data = ecg.data.column(0);
    let min = ecg_data.fold(f64::INFINITY, |a, &b| a.min(b));
    let max = ecg_data.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let mean = ecg_data.sum() / ecg_data.len() as f64;

    println!(" Min: {:.3} mV", min);
    println!(" Max: {:.3} mV", max);
    println!(" Mean: {:.3} mV", mean);

    // Note: Stock market and weather datasets are commented out because their source data
    // is not yet available.

    /*
    // Load the stock market dataset
    println!("\nStock market dataset:");

    // Get price changes (returns)
    let stock_returns = stock_market(true)?;
    println!(" Time steps: {}", stock_returns.n_samples());
    println!(" Companies: {}", stock_returns.n_features());

    // Print companies
    if let Some(feature_names) = &stock_returns.feature_names {
        println!(" Companies: {}", feature_names.join(", "));
    }

    // Load the weather dataset
    println!("\nWeather dataset:");
    let temp_data = weather(Some("temperature"))?;

    println!(" Time steps: {}", temp_data.n_samples());
    println!(" Locations: {}", temp_data.n_features());
    */

    println!("\nTime series dataset loaded successfully!");

    Ok(())
}
fn main() {
    // Check if a CSV file is provided as a command-line argument
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        println!("Usage: {} <path_to_csv_file>", args[0]);
        println!("Example: {} examples/sample_data.csv", args[0]);
        return;
    }

    let file_path = &args[1];

    // Verify the file exists
    if !Path::new(file_path).exists() {
        println!("Error: File '{}' does not exist", file_path);
        return;
    }

    // Load CSV file
    println!("Loading CSV file: {}", file_path);
    match loaders::load_csv(file_path, true, None) {
        Ok(dataset) => {
            print_dataset_info(&dataset, "Loaded CSV");

            // Split the dataset for demonstration
            println!("\nDemonstrating train-test split...");
            match train_test_split(&dataset, 0.2, Some(42)) {
                Ok((train, test)) => {
                    println!("Training set: {} samples", train.n_samples());
                    println!("Test set: {} samples", test.n_samples());

                    // Save as JSON for demonstration
                    let json_path = format!("{}.json", file_path);
                    println!("\nSaving training dataset to JSON: {}", json_path);
                    if let Err(e) = loaders::save_json(&train, &json_path) {
                        println!("Error saving JSON: {}", e);
                    } else {
                        println!("Successfully saved JSON file");

                        // Load back the JSON file
                        println!("\nLoading back from JSON file...");
                        match loaders::load_json(&json_path) {
                            Ok(loaded) => {
                                print_dataset_info(&loaded, "Loaded JSON");
                            }
                            Err(e) => println!("Error loading JSON: {}", e),
                        }
                    }
                }
                Err(e) => println!("Error splitting dataset: {}", e),
            }
        }
        Err(e) => println!("Error loading CSV: {}", e),
    }
}

fn print_dataset_info(dataset: &Dataset, name: &str) {
    println!("=== {} Dataset ===", name);
    println!("Number of samples: {}", dataset.n_samples());
    println!("Number of features: {}", dataset.n_features());

    if let Some(feature_names) = &dataset.feature_names {
        println!(
            "Features: {:?}",
            &feature_names[0..std::cmp::min(5, feature_names.len())]
        );
        if feature_names.len() > 5 {
            println!("... and {} more", feature_names.len() - 5);
        }
    }

    if let Some(target) = &dataset.target {
        println!("Target shape: {}", target.len());

        if let Some(target_names) = &dataset.target_names {
            println!("Target classes: {:?}", target_names);
        }
    }

    for (key, value) in &dataset.metadata {
        println!("Metadata - {}: {}", key, value);
    }
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("Creating synthetic datasets...\n");

    // Generate classification dataset
    let n_samples = 100;
    let n_features = 5;

    let classification_data = make_classification(
        n_samples,
        n_features,
        3,        // 3 classes
        2,        // 2 clusters per class
        3,        // 3 informative features
        Some(42), // random seed
    )?;

    // Train-test split
    let (train, test) = train_test_split(&classification_data, 0.2, Some(42))?;

    println!("Classification dataset:");
    println!(" Total samples: {}", classification_data.n_samples());
    println!(" Features: {}", classification_data.n_features());
    println!(" Training samples: {}", train.n_samples());
    println!(" Test samples: {}", test.n_samples());

    // Generate regression dataset
    let regression_data = make_regression(
        n_samples,
        n_features,
        3,   // 3 informative features
        0.5, // noise level
        Some(42),
    )?;

    println!("\nRegression dataset:");
    println!(" Samples: {}", regression_data.n_samples());
    println!(" Features: {}", regression_data.n_features());

    // Normalize the data (in-place)
    let mut data_copy = regression_data.data.clone();
    normalize(&mut data_copy);
    println!(" Data normalized successfully");

    // Generate clustering data (blobs)
    let clustering_data = make_blobs(
        n_samples,
        2,   // 2 features for easy visualization
        4,   // 4 clusters
        0.8, // cluster standard deviation
        Some(42),
    )?;

    println!("\nClustering dataset (blobs):");
    println!(" Samples: {}", clustering_data.n_samples());
    println!(" Features: {}", clustering_data.n_features());

    // Find the number of clusters by finding the max value of target
    let num_clusters = clustering_data.target.as_ref().map_or(0, |t| {
        let mut max_val = -1.0;
        for &val in t.iter() {
            if val > max_val {
                max_val = val;
            }
        }
        (max_val as usize) + 1
    });

    println!(" Clusters: {}", num_clusters);

    // Generate time series data
    let time_series = make_time_series(
        100,  // 100 time steps
        3,    // 3 features/variables
        true, // with trend
        true, // with seasonality
        0.2,  // noise level
        Some(42),
    )?;

    println!("\nTime series dataset:");
    println!(" Time steps: {}", time_series.n_samples());
    println!(" Features: {}", time_series.n_features());

    Ok(())
}
pub fn n_features(&self) -> usize
Returns the number of features (columns) in the dataset
Examples found in repository
The Iris/Boston, CSV-loading, corruption, time-series, and print_dataset_info examples shown under n_samples above also exercise n_features. The helper below is unique to this listing.
fn print_dataset_summary(dataset: &scirs2_datasets::Dataset, name: &str) {
    let n_classes = if let Some(target) = &dataset.target {
        let unique_labels: std::collections::HashSet<_> =
            target.iter().map(|&x| x as i32).collect();
        unique_labels.len()
    } else {
        0
    };

    let class_info = if n_classes > 0 {
        format!(", {} classes", n_classes)
    } else {
        " (unsupervised)".to_string()
    };

    println!(
        " {}: {} samples, {} features{}",
        name,
        dataset.n_samples(),
        dataset.n_features(),
        class_info
    );

    // Print first few data points for small datasets
    if dataset.n_samples() <= 10 && dataset.n_features() <= 3 {
        println!(" Sample points:");
        for i in 0..dataset.n_samples().min(3) {
            let point: Vec<f64> = (0..dataset.n_features())
                .map(|j| dataset.data[[i, j]])
                .collect();
            println!(
                " [{:.3}, {:.3}{}]",
                point[0],
                point[1],
                if point.len() > 2 {
                    format!(", {:.3}", point[2])
                } else {
                    "".to_string()
                }
            );
        }
    }
}
pub fn has_target(&self) -> bool
Returns true if the dataset has target values
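A minimal sketch:
use ndarray::{Array1, Array2};
use scirs2_datasets::utils::Dataset;
let unsupervised = Dataset::new(Array2::zeros((5, 2)), None);
assert!(!unsupervised.has_target());
let supervised = Dataset::new(Array2::zeros((5, 2)), Some(Array1::zeros(5)));
assert!(supervised.has_target());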
pub fn feature_names(&self) -> Option<&Vec<String>>
Get a reference to the feature names if available
§Returns
Optional reference to feature names vector
pub fn target_names(&self) -> Option<&Vec<String>>
Get a reference to the target names if available
§Returns
Optional reference to target names vector
pub fn description(&self) -> Option<&String>
Get a reference to the dataset description if available
§Returns
Optional reference to dataset description
pub fn set_metadata(&mut self, key: &str, value: &str)
Set a metadata key-value pair on an existing dataset
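Unlike with_metadata, this mutates an existing dataset in place; a minimal sketch (the key and value are illustrative):
use ndarray::Array2;
use scirs2_datasets::utils::Dataset;
let mut dataset = Dataset::new(Array2::zeros((5, 2)), None);
dataset.set_metadata("license", "CC0");
assert_eq!(dataset.metadata.get("license"), Some(&"CC0".to_string()));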
§Trait Implementations
impl<'de> Deserialize<'de> for Dataset
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>
where
    __D: Deserializer<'de>,
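Because Dataset implements Deserialize, it can be read back from any serde-compatible format. A minimal sketch with serde_json, assuming the file was produced by a matching serializer such as loaders::save_json from the examples above:
use scirs2_datasets::utils::Dataset;
fn read_dataset(path: &str) -> Result<Dataset, Box<dyn std::error::Error>> {
    // Read the raw JSON, then let serde reconstruct the Dataset
    let json = std::fs::read_to_string(path)?;
    let dataset: Dataset = serde_json::from_str(&json)?;
    Ok(dataset)
}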
§Auto Trait Implementations
impl Freeze for Dataset
impl RefUnwindSafe for Dataset
impl Send for Dataset
impl Sync for Dataset
impl Unpin for Dataset
impl UnwindSafe for Dataset
§Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.