Struct Dataset

pub struct Dataset {
    pub data: Array2<f64>,
    pub target: Option<Array1<f64>>,
    pub target_names: Option<Vec<String>>,
    pub feature_names: Option<Vec<String>>,
    pub feature_descriptions: Option<Vec<String>>,
    pub description: Option<String>,
    pub metadata: HashMap<String, String>,
}

Represents a dataset with features, optional targets, and metadata

The Dataset struct is the core data structure for managing machine learning datasets. It stores the feature matrix, optional target values, and rich metadata including feature names, descriptions, and arbitrary key-value pairs.

Examples

use ndarray::Array2;
use scirs2_datasets::utils::Dataset;

let data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let dataset = Dataset::new(data, None)
    .with_feature_names(vec!["feature1".to_string(), "feature2".to_string()])
    .with_description("Sample dataset".to_string());

assert_eq!(dataset.n_samples(), 3);
assert_eq!(dataset.n_features(), 2);

Fields

data: Array2<f64>

Feature/data matrix of shape (n_samples, n_features)

target: Option<Array1<f64>>

Optional target values

target_names: Option<Vec<String>>

Optional target names for classification problems

feature_names: Option<Vec<String>>

Optional feature names

feature_descriptions: Option<Vec<String>>

Optional descriptions for each feature

description: Option<String>

Optional dataset description

metadata: HashMap<String, String>

Arbitrary key-value metadata for the dataset

Implementations

impl Dataset

pub fn new(data: Array2<f64>, target: Option<Array1<f64>>) -> Self

Create a new dataset with the given data and target

Arguments
  • data - The feature matrix (n_samples, n_features)
  • target - Optional target values (n_samples,)
Returns

A new Dataset instance with empty metadata

Examples
use ndarray::{Array1, Array2};
use scirs2_datasets::utils::Dataset;

let data = Array2::zeros((100, 5));
let target = Some(Array1::zeros(100));
let dataset = Dataset::new(data, target);
Examples found in repository
examples/cross_validation_demo.rs (line 20)
fn main() {
    println!("=== Cross-Validation Demonstration ===\n");

    // Create sample dataset
    let data = Array2::from_shape_vec((20, 3), (0..60).map(|x| x as f64 / 10.0).collect()).unwrap();
    let target = Array1::from(
        (0..20)
            .map(|i| if i % 2 == 0 { 0.0 } else { 1.0 })
            .collect::<Vec<_>>(),
    );

    let dataset = Dataset::new(data.clone(), Some(target.clone()))
        .with_description("Sample dataset for cross-validation demo".to_string());

    println!("Dataset info:");
    println!("- Samples: {}", dataset.n_samples());
    println!("- Features: {}", dataset.n_features());
    println!("- Description: {}\n", dataset.description.as_ref().unwrap());

    // Demonstrate K-fold cross-validation
    println!("=== K-Fold Cross-Validation (k=5) ===");
    let k_folds = k_fold_split(dataset.n_samples(), 5, true, Some(42)).unwrap();

    for (i, (train_indices, val_indices)) in k_folds.iter().enumerate() {
        println!(
            "Fold {}: Train size: {}, Validation size: {}",
            i + 1,
            train_indices.len(),
            val_indices.len()
        );
        println!(
            "  Train indices: {:?}",
            &train_indices[..5.min(train_indices.len())]
        );
        println!("  Val indices: {:?}", val_indices);
    }
    println!();

    // Demonstrate Stratified K-fold cross-validation
    println!("=== Stratified K-Fold Cross-Validation (k=4) ===");
    let stratified_folds = stratified_k_fold_split(&target, 4, true, Some(42)).unwrap();

    for (i, (train_indices, val_indices)) in stratified_folds.iter().enumerate() {
        // Calculate class distribution in validation set
        let val_targets: Vec<f64> = val_indices.iter().map(|&idx| target[idx]).collect();
        let class_0_count = val_targets.iter().filter(|&&x| x == 0.0).count();
        let class_1_count = val_targets.iter().filter(|&&x| x == 1.0).count();

        println!(
            "Fold {}: Train size: {}, Validation size: {}",
            i + 1,
            train_indices.len(),
            val_indices.len()
        );
        println!(
            "  Class distribution in validation: Class 0: {}, Class 1: {}",
            class_0_count, class_1_count
        );
    }
    println!();

    // Demonstrate Time Series cross-validation
    println!("=== Time Series Cross-Validation ===");
    let ts_folds = time_series_split(dataset.n_samples(), 3, 3, 1).unwrap();

    for (i, (train_indices, val_indices)) in ts_folds.iter().enumerate() {
        println!(
            "Split {}: Train size: {}, Test size: {}",
            i + 1,
            train_indices.len(),
            val_indices.len()
        );
        println!(
            "  Train range: {} to {}",
            train_indices.first().unwrap_or(&0),
            train_indices.last().unwrap_or(&0)
        );
        println!(
            "  Test range: {} to {}",
            val_indices.first().unwrap_or(&0),
            val_indices.last().unwrap_or(&0)
        );
    }
    println!();

    // Demonstrate usage with Dataset methods
    println!("=== Using Cross-Validation with Dataset ===");
    let first_fold = &k_folds[0];
    let (train_indices, val_indices) = first_fold;

    // Create training subset
    let train_data = data.select(ndarray::Axis(0), train_indices);
    let train_target = target.select(ndarray::Axis(0), train_indices);
    let train_dataset = Dataset::new(train_data, Some(train_target))
        .with_description("Training fold from K-fold CV".to_string());

    // Create validation subset
    let val_data = data.select(ndarray::Axis(0), val_indices);
    let val_target = target.select(ndarray::Axis(0), val_indices);
    let val_dataset = Dataset::new(val_data, Some(val_target))
        .with_description("Validation fold from K-fold CV".to_string());

    println!(
        "Training dataset: {} samples, {} features",
        train_dataset.n_samples(),
        train_dataset.n_features()
    );
    println!(
        "Validation dataset: {} samples, {} features",
        val_dataset.n_samples(),
        val_dataset.n_features()
    );

    println!("\n=== Cross-Validation Demo Complete ===");
}
More examples
examples/sampling_demo.rs (line 45)
fn main() {
    println!("=== Sampling and Bootstrapping Demonstration ===\n");

    // Load the Iris dataset for demonstration
    let iris = load_iris().unwrap();
    let n_samples = iris.n_samples();

    println!("Original Iris dataset:");
    println!("- Samples: {}", n_samples);
    println!("- Features: {}", iris.n_features());

    if let Some(target) = &iris.target {
        let class_counts = count_classes(target);
        println!("- Class distribution: {:?}\n", class_counts);
    }

    // Demonstrate random sampling without replacement
    println!("=== Random Sampling (without replacement) ===");
    let sample_size = 30;
    let random_indices = random_sample(n_samples, sample_size, false, Some(42)).unwrap();

    println!(
        "Sampled {} indices from {} total samples",
        sample_size, n_samples
    );
    println!(
        "Sample indices: {:?}",
        &random_indices[..10.min(random_indices.len())]
    );

    // Create a subset dataset
    let sample_data = iris.data.select(ndarray::Axis(0), &random_indices);
    let sample_target = iris
        .target
        .as_ref()
        .map(|t| t.select(ndarray::Axis(0), &random_indices));
    let sample_dataset = Dataset::new(sample_data, sample_target)
        .with_description("Random sample from Iris dataset".to_string());

    println!(
        "Random sample dataset: {} samples, {} features",
        sample_dataset.n_samples(),
        sample_dataset.n_features()
    );

    if let Some(target) = &sample_dataset.target {
        let sample_class_counts = count_classes(target);
        println!("Sample class distribution: {:?}\n", sample_class_counts);
    }

    // Demonstrate bootstrap sampling (with replacement)
    println!("=== Bootstrap Sampling (with replacement) ===");
    let bootstrap_size = 200; // More than original dataset size
    let bootstrap_indices = random_sample(n_samples, bootstrap_size, true, Some(42)).unwrap();

    println!(
        "Bootstrap sampled {} indices from {} total samples",
        bootstrap_size, n_samples
    );
    println!(
        "Bootstrap may have duplicates - first 10 indices: {:?}",
        &bootstrap_indices[..10]
    );

    // Count frequency of each index in bootstrap sample
    let mut index_counts = vec![0; n_samples];
    for &idx in &bootstrap_indices {
        index_counts[idx] += 1;
    }
    let max_count = *index_counts.iter().max().unwrap();
    let zero_count = index_counts.iter().filter(|&&count| count == 0).count();

    println!("Bootstrap statistics:");
    println!("- Maximum frequency of any sample: {}", max_count);
    println!(
        "- Number of original samples not selected: {}\n",
        zero_count
    );

    // Demonstrate stratified sampling
    println!("=== Stratified Sampling ===");
    if let Some(target) = &iris.target {
        let stratified_size = 30;
        let stratified_indices = stratified_sample(target, stratified_size, Some(42)).unwrap();

        println!(
            "Stratified sampled {} indices maintaining class proportions",
            stratified_size
        );

        // Create stratified subset
        let stratified_data = iris.data.select(ndarray::Axis(0), &stratified_indices);
        let stratified_target = target.select(ndarray::Axis(0), &stratified_indices);
        let stratified_dataset = Dataset::new(stratified_data, Some(stratified_target))
            .with_description("Stratified sample from Iris dataset".to_string());

        println!(
            "Stratified sample dataset: {} samples, {} features",
            stratified_dataset.n_samples(),
            stratified_dataset.n_features()
        );

        let stratified_class_counts = count_classes(&stratified_dataset.target.unwrap());
        println!(
            "Stratified sample class distribution: {:?}",
            stratified_class_counts
        );

        // Verify proportions are maintained
        let original_proportions = calculate_proportions(&count_classes(target));
        let stratified_proportions = calculate_proportions(&stratified_class_counts);

        println!("Class proportion comparison:");
        for (&class, &original_prop) in &original_proportions {
            let stratified_prop = stratified_proportions.get(&class).unwrap_or(&0.0);
            println!(
                "  Class {}: Original {:.2}%, Stratified {:.2}%",
                class,
                original_prop * 100.0,
                stratified_prop * 100.0
            );
        }
    }

    // Demonstrate practical use case: creating training/validation splits
    println!("\n=== Practical Example: Multiple Train/Validation Splits ===");
    for i in 1..=3 {
        let split_indices = random_sample(n_samples, 100, false, Some(42 + i)).unwrap();
        let (train_indices, val_indices) = split_indices.split_at(80);

        println!(
            "Split {}: {} training samples, {} validation samples",
            i,
            train_indices.len(),
            val_indices.len()
        );
    }

    println!("\n=== Sampling Demo Complete ===");
}

pub fn with_target_names(self, target_names: Vec<String>) -> Self

Add target names to the dataset (builder pattern)

Arguments
  • target_names - Vector of target class names
Returns

Self for method chaining
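
Examples

A minimal sketch of labeling a three-class target; the class names used here are illustrative:

use ndarray::{Array1, Array2};
use scirs2_datasets::utils::Dataset;

let data = Array2::zeros((3, 2));
let target = Some(Array1::from(vec![0.0, 1.0, 2.0]));
let dataset = Dataset::new(data, target)
    .with_target_names(vec![
        "setosa".to_string(),
        "versicolor".to_string(),
        "virginica".to_string(),
    ]);

assert!(dataset.target_names().is_some());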

pub fn with_feature_names(self, feature_names: Vec<String>) -> Self

Add feature names to the dataset (builder pattern)

Arguments
  • feature_names - Vector of feature names
Returns

Self for method chaining

pub fn with_feature_descriptions(self, feature_descriptions: Vec<String>) -> Self

Add feature descriptions to the dataset (builder pattern)

Arguments
  • feature_descriptions - Vector of feature descriptions
Returns

Self for method chaining
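
Examples

A minimal sketch chaining feature names with matching descriptions (both vectors are illustrative):

use ndarray::Array2;
use scirs2_datasets::utils::Dataset;

let data = Array2::zeros((10, 2));
let dataset = Dataset::new(data, None)
    .with_feature_names(vec!["height".to_string(), "weight".to_string()])
    .with_feature_descriptions(vec![
        "Height in centimeters".to_string(),
        "Weight in kilograms".to_string(),
    ]);

assert_eq!(dataset.feature_names().map(|n| n.len()), Some(2));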

pub fn with_description(self, description: String) -> Self

Add a description to the dataset (builder pattern)

Arguments
  • description - Dataset description
Returns

Self for method chaining

Examples found in repository
examples/cross_validation_demo.rs (line 21): full listing shown under new above
More examples
examples/sampling_demo.rs (line 46): full listing shown under new above

pub fn with_metadata(self, key: &str, value: &str) -> Self

Add metadata to the dataset (builder pattern)

Arguments
  • key - Metadata key
  • value - Metadata value
Returns

Self for method chaining
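
Examples

A minimal sketch attaching key-value metadata during construction (the keys shown are illustrative, not ones the crate defines):

use ndarray::Array2;
use scirs2_datasets::utils::Dataset;

let data = Array2::zeros((50, 4));
let dataset = Dataset::new(data, None)
    .with_metadata("source", "synthetic")
    .with_metadata("version", "1.0");

assert_eq!(dataset.get_metadata("source"), Some(&"synthetic".to_string()));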

pub fn n_samples(&self) -> usize

Get the number of samples in the dataset

Returns

Number of samples (rows) in the dataset

Examples found in repository
examples/toy_datasets.rs (line 6)
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let iris = load_iris()?;
    println!("Iris dataset loaded:");
    println!("  Samples: {}", iris.n_samples());
    println!("  Features: {}", iris.n_features());
    println!(
        "  Target classes: {}",
        iris.target_names.as_ref().map_or(0, |v| v.len())
    );

    let boston = load_boston()?;
    println!("\nBoston Housing dataset loaded:");
    println!("  Samples: {}", boston.n_samples());
    println!("  Features: {}", boston.n_features());

    Ok(())
}
More examples
examples/csv_loading.rs (line 12)
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load a CSV file with headers and target column
    let dataset = load_csv(
        "scirs2-datasets/data/example.csv",
        true,    // has header
        Some(3), // target column index (0-based)
    )?;

    println!("CSV dataset loaded successfully:");
    println!("  Samples: {}", dataset.n_samples());
    println!("  Features: {}", dataset.n_features());
    println!("  Feature names: {:?}", dataset.feature_names);

    // Access data and target
    println!("\nFirst 3 samples:");
    for i in 0..3 {
        let features = dataset.data.row(i);
        let target = dataset.target.as_ref().map(|t| t[i]);
        println!(
            "  Sample {}: Features = {:?}, Target = {:?}",
            i, features, target
        );
    }

    Ok(())
}
examples/noise_models_demo.rs (line 198)
fn demonstrate_comprehensive_corruption() {
    println!("Testing comprehensive dataset corruption:");

    // Load a real dataset
    let iris = load_iris().unwrap();
    println!(
        "Original Iris dataset: {} samples, {} features",
        iris.n_samples(),
        iris.n_features()
    );

    let original_stats = calculate_basic_stats(&iris.data);
    println!(
        "Original stats - Mean: {:.3}, Std: {:.3}",
        original_stats.0, original_stats.1
    );

    // Create different levels of corruption
    let corruption_levels = [
        (0.05, 0.02, "Light corruption"),
        (0.1, 0.05, "Moderate corruption"),
        (0.2, 0.1, "Heavy corruption"),
        (0.3, 0.15, "Severe corruption"),
    ];

    for (missing_rate, outlier_rate, description) in corruption_levels {
        let corrupted = make_corrupted_dataset(
            &iris,
            missing_rate,
            MissingPattern::MAR, // More realistic than MCAR
            outlier_rate,
            OutlierType::Point,
            2.5,
            Some(42),
        )
        .unwrap();

        // Calculate how much data is usable
        let total_elements = corrupted.data.len();
        let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
        let usable_percentage =
            ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;

        println!("{}:", description);
        println!("  Missing data: {:.1}%", missing_rate * 100.0);
        println!("  Outliers: {:.1}%", outlier_rate * 100.0);
        println!("  Usable data: {:.1}%", usable_percentage);

        // Show metadata
        if let Some(missing_count) = corrupted.metadata.get("missing_count") {
            println!("  Actual missing: {} elements", missing_count);
        }
        if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
            println!("  Actual outliers: {} samples", outlier_count);
        }
    }
}
examples/time_series_datasets.rs (line 11)
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("Loading time series datasets...\n");

    // Load the electrocardiogram dataset
    let ecg = electrocardiogram()?;

    println!("Electrocardiogram dataset:");
    println!("  Time steps: {}", ecg.n_samples());
    println!("  Features: {}", ecg.n_features());
    println!(
        "  Sampling rate: {} Hz",
        ecg.metadata
            .get("sampling_rate")
            .unwrap_or(&"unknown".to_string())
    );
    println!(
        "  Duration: {}",
        ecg.metadata
            .get("duration")
            .unwrap_or(&"unknown".to_string())
    );

    // Get a slice of the data and display basic statistics
    let ecg_slice = ecg.data.slice(s![0..10, 0]);
    println!("  First 10 data points: {:?}", ecg_slice);

    // Calculate some basic statistics
    let ecg_data = ecg.data.column(0);
    let min = ecg_data.fold(f64::INFINITY, |a, &b| a.min(b));
    let max = ecg_data.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let mean = ecg_data.sum() / ecg_data.len() as f64;

    println!("  Min: {:.3} mV", min);
    println!("  Max: {:.3} mV", max);
    println!("  Mean: {:.3} mV", mean);

    // Note: Stock market and weather datasets are commented out because their source data
    // is not yet available.

    /*
    // Load the stock market dataset
    println!("\nStock market dataset:");

    // Get price changes (returns)
    let stock_returns = stock_market(true)?;
    println!("  Time steps: {}", stock_returns.n_samples());
    println!("  Companies: {}", stock_returns.n_features());

    // Print companies
    if let Some(feature_names) = &stock_returns.feature_names {
        println!("  Companies: {}", feature_names.join(", "));
    }

    // Load the weather dataset
    println!("\nWeather dataset:");
    let temp_data = weather(Some("temperature"))?;

    println!("  Time steps: {}", temp_data.n_samples());
    println!("  Locations: {}", temp_data.n_features());
    */

    println!("\nTime series dataset loaded successfully!");

    Ok(())
}
examples/dataset_loaders.rs (line 33)
fn main() {
    // Check if a CSV file is provided as a command-line argument
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        println!("Usage: {} <path_to_csv_file>", args[0]);
        println!("Example: {} examples/sample_data.csv", args[0]);
        return;
    }

    let file_path = &args[1];

    // Verify the file exists
    if !Path::new(file_path).exists() {
        println!("Error: File '{}' does not exist", file_path);
        return;
    }

    // Load CSV file
    println!("Loading CSV file: {}", file_path);
    match loaders::load_csv(file_path, true, None) {
        Ok(dataset) => {
            print_dataset_info(&dataset, "Loaded CSV");

            // Split the dataset for demonstration
            println!("\nDemonstrating train-test split...");
            match train_test_split(&dataset, 0.2, Some(42)) {
                Ok((train, test)) => {
                    println!("Training set: {} samples", train.n_samples());
                    println!("Test set: {} samples", test.n_samples());

                    // Save as JSON for demonstration
                    let json_path = format!("{}.json", file_path);
                    println!("\nSaving training dataset to JSON: {}", json_path);
                    if let Err(e) = loaders::save_json(&train, &json_path) {
                        println!("Error saving JSON: {}", e);
                    } else {
                        println!("Successfully saved JSON file");

                        // Load back the JSON file
                        println!("\nLoading back from JSON file...");
                        match loaders::load_json(&json_path) {
                            Ok(loaded) => {
                                print_dataset_info(&loaded, "Loaded JSON");
                            }
                            Err(e) => println!("Error loading JSON: {}", e),
                        }
                    }
                }
                Err(e) => println!("Error splitting dataset: {}", e),
            }
        }
        Err(e) => println!("Error loading CSV: {}", e),
    }
}

fn print_dataset_info(dataset: &Dataset, name: &str) {
    println!("=== {} Dataset ===", name);
    println!("Number of samples: {}", dataset.n_samples());
    println!("Number of features: {}", dataset.n_features());

    if let Some(feature_names) = &dataset.feature_names {
        println!(
            "Features: {:?}",
            &feature_names[0..std::cmp::min(5, feature_names.len())]
        );
        if feature_names.len() > 5 {
            println!("... and {} more", feature_names.len() - 5);
        }
    }

    if let Some(target) = &dataset.target {
        println!("Target shape: {}", target.len());

        if let Some(target_names) = &dataset.target_names {
            println!("Target classes: {:?}", target_names);
        }
    }

    for (key, value) in &dataset.metadata {
        println!("Metadata - {}: {}", key, value);
    }
}
examples/data_generators.rs (line 26)
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("Creating synthetic datasets...\n");

    // Generate classification dataset
    let n_samples = 100;
    let n_features = 5;

    let classification_data = make_classification(
        n_samples,
        n_features,
        3,        // 3 classes
        2,        // 2 clusters per class
        3,        // 3 informative features
        Some(42), // random seed
    )?;

    // Train-test split
    let (train, test) = train_test_split(&classification_data, 0.2, Some(42))?;

    println!("Classification dataset:");
    println!("  Total samples: {}", classification_data.n_samples());
    println!("  Features: {}", classification_data.n_features());
    println!("  Training samples: {}", train.n_samples());
    println!("  Test samples: {}", test.n_samples());

    // Generate regression dataset
    let regression_data = make_regression(
        n_samples,
        n_features,
        3,   // 3 informative features
        0.5, // noise level
        Some(42),
    )?;

    println!("\nRegression dataset:");
    println!("  Samples: {}", regression_data.n_samples());
    println!("  Features: {}", regression_data.n_features());

    // Normalize the data (in-place)
    let mut data_copy = regression_data.data.clone();
    normalize(&mut data_copy);
    println!("  Data normalized successfully");

    // Generate clustering data (blobs)
    let clustering_data = make_blobs(
        n_samples,
        2,   // 2 features for easy visualization
        4,   // 4 clusters
        0.8, // cluster standard deviation
        Some(42),
    )?;

    println!("\nClustering dataset (blobs):");
    println!("  Samples: {}", clustering_data.n_samples());
    println!("  Features: {}", clustering_data.n_features());

    // Find the number of clusters by finding the max value of target
    let num_clusters = clustering_data.target.as_ref().map_or(0, |t| {
        let mut max_val = -1.0;
        for &val in t.iter() {
            if val > max_val {
                max_val = val;
            }
        }
        (max_val as usize) + 1
    });

    println!("  Clusters: {}", num_clusters);

    // Generate time series data
    let time_series = make_time_series(
        100,  // 100 time steps
        3,    // 3 features/variables
        true, // with trend
        true, // with seasonality
        0.2,  // noise level
        Some(42),
    )?;

    println!("\nTime series dataset:");
    println!("  Time steps: {}", time_series.n_samples());
    println!("  Features: {}", time_series.n_features());

    Ok(())
}

pub fn n_features(&self) -> usize

Get the number of features in the dataset

Returns

Number of features (columns) in the dataset

Examples found in repository
examples/toy_datasets.rs (line 7): full listing shown under n_samples above
More examples
examples/csv_loading.rs (line 13): full listing shown under n_samples above
examples/dataset_loaders.rs (line 64): full listing shown under n_samples above
examples/complex_patterns_demo.rs (line 201)
fn print_dataset_summary(dataset: &scirs2_datasets::Dataset, name: &str) {
    let n_classes = if let Some(target) = &dataset.target {
        let unique_labels: std::collections::HashSet<_> =
            target.iter().map(|&x| x as i32).collect();
        unique_labels.len()
    } else {
        0
    };

    let class_info = if n_classes > 0 {
        format!(", {} classes", n_classes)
    } else {
        " (unsupervised)".to_string()
    };

    println!(
        "   {}: {} samples, {} features{}",
        name,
        dataset.n_samples(),
        dataset.n_features(),
        class_info
    );

    // Print first few data points for small datasets
    if dataset.n_samples() <= 10 && dataset.n_features() <= 3 {
        println!("   Sample points:");
        for i in 0..dataset.n_samples().min(3) {
            let point: Vec<f64> = (0..dataset.n_features())
                .map(|j| dataset.data[[i, j]])
                .collect();
            println!(
                "     [{:.3}, {:.3}{}]",
                point[0],
                point[1],
                if point.len() > 2 {
                    format!(", {:.3}", point[2])
                } else {
                    "".to_string()
                }
            );
        }
    }
}
examples/noise_models_demo.rs (line 199): full listing shown under n_samples above
examples/time_series_datasets.rs (line 12): full listing shown under n_samples above

pub fn shape(&self) -> (usize, usize)

Get dataset shape as (n_samples, n_features)

Returns

Tuple of (n_samples, n_features)

pub fn has_target(&self) -> bool

Check if the dataset has target values

Returns

True if target values are present, false otherwise
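
Examples

A small sketch using shape() together with has_target() to branch between supervised and unsupervised handling:

use ndarray::{Array1, Array2};
use scirs2_datasets::utils::Dataset;

let dataset = Dataset::new(Array2::zeros((100, 5)), Some(Array1::zeros(100)));
let (n_samples, n_features) = dataset.shape();
assert_eq!((n_samples, n_features), (100, 5));

if dataset.has_target() {
    println!("Supervised dataset: {} samples, {} features", n_samples, n_features);
}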

pub fn feature_names(&self) -> Option<&Vec<String>>

Get a reference to the feature names if available

Returns

Optional reference to feature names vector

pub fn target_names(&self) -> Option<&Vec<String>>

Get a reference to the target names if available

Returns

Optional reference to target names vector

pub fn description(&self) -> Option<&String>

Get a reference to the dataset description if available

Returns

Optional reference to dataset description

pub fn metadata(&self) -> &HashMap<String, String>

Get a reference to the metadata

Returns

Reference to metadata HashMap
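
Examples

A minimal sketch of reading values back through the accessor methods; the names and description mirror the builder example at the top of this page:

use ndarray::Array2;
use scirs2_datasets::utils::Dataset;

let dataset = Dataset::new(Array2::zeros((3, 2)), None)
    .with_feature_names(vec!["feature1".to_string(), "feature2".to_string()])
    .with_description("Sample dataset".to_string());

assert_eq!(dataset.feature_names().map(|n| n.len()), Some(2));
assert_eq!(dataset.description().map(String::as_str), Some("Sample dataset"));
assert!(dataset.metadata().is_empty());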

pub fn set_metadata(&mut self, key: &str, value: &str)

Add or update a metadata entry

Arguments
  • key - Metadata key
  • value - Metadata value

pub fn get_metadata(&self, key: &str) -> Option<&String>

Get a metadata value by key

Arguments
  • key - Metadata key to lookup
Returns

Optional reference to the metadata value
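
Examples

A minimal sketch of mutating metadata after construction (the key and value are illustrative):

use ndarray::Array2;
use scirs2_datasets::utils::Dataset;

let mut dataset = Dataset::new(Array2::zeros((3, 2)), None);
dataset.set_metadata("license", "CC0");

assert_eq!(dataset.get_metadata("license"), Some(&"CC0".to_string()));
assert_eq!(dataset.get_metadata("missing"), None);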

Trait Implementations

impl Clone for Dataset

fn clone(&self) -> Dataset

Returns a duplicate of the value. Read more

const fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

impl Debug for Dataset

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

impl<'de> Deserialize<'de> for Dataset

fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>
where __D: Deserializer<'de>,

Deserialize this value from the given Serde deserializer. Read more

impl Serialize for Dataset

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error>
where __S: Serializer,

Serialize this value into the given Serde serializer. Read more
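
Because Dataset implements Serialize and Deserialize, it can round-trip through any Serde data format. A minimal sketch, assuming the serde_json crate is added as a dependency:

use ndarray::Array2;
use scirs2_datasets::utils::Dataset;

let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
let dataset = Dataset::new(data, None).with_description("Roundtrip demo".to_string());

// Serialize to a JSON string and restore it again
let json = serde_json::to_string(&dataset).expect("serialize Dataset");
let restored: Dataset = serde_json::from_str(&json).expect("deserialize Dataset");

assert_eq!(restored.n_samples(), dataset.n_samples());
assert_eq!(restored.description(), dataset.description());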

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T
where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T
where T: ?Sized,

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T
where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

impl<T> CloneToUninit for T
where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬 This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more

impl<T> From<T> for T

fn from(t: T) -> T

Returns the argument unchanged.

impl<T, U> Into<U> for T
where U: From<T>,

fn into(self) -> U

Calls U::from(self). That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true; otherwise converts self into a Right variant. Read more

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true; otherwise converts self into a Right variant. Read more

impl<T> Pointable for T

const ALIGN: usize

The alignment of pointer.

type Init = T

The type for initializers.

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a pointer with the given initializer. Read more

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T> Same for T

type Output = T

Should always be Self

impl<T> ToOwned for T
where T: Clone,

type Owned = T

The resulting type after obtaining ownership.

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T
where U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,

impl<T> ErasedDestructor for T
where T: 'static,