Struct Dataset

Source

pub struct Dataset {
    pub data: Array2<f64>,
    pub target: Option<Array1<f64>>,
    pub target_names: Option<Vec<String>>,
    pub feature_names: Option<Vec<String>>,
    pub feature_descriptions: Option<Vec<String>>,
    pub description: Option<String>,
    pub metadata: HashMap<String, String>,
}

Expand description

Represents a dataset with features, optional targets, and metadata

Fields§

§data: Array2<f64>

Features/data matrix of shape (n_samples, n_features)

§target: Option<Array1<f64>>

Optional target values

§target_names: Option<Vec<String>>

Optional target names for classification problems

§feature_names: Option<Vec<String>>

Optional feature names

§feature_descriptions: Option<Vec<String>>

Optional descriptions for each feature

§description: Option<String>

Optional dataset description

§metadata: HashMap<String, String>

Dataset metadata as string key-value pairs (always present, unlike the `Option` fields above)

Implementations§

Source §

impl Dataset

Source

pub fn new(data: Array2<f64>, target: Option<Array1<f64>>) -> Self

Create a new dataset with the given data and optional target

Source

pub fn with_target_names(self, target_names: Vec<String>) -> Self

Add target names to the dataset

Source

pub fn with_feature_names(self, feature_names: Vec<String>) -> Self

Add feature names to the dataset

Source

pub fn with_feature_descriptions( self, feature_descriptions: Vec<String>, ) -> Self

Add feature descriptions to the dataset

Source

pub fn with_description(self, description: String) -> Self

Add a description to the dataset

Source

pub fn with_metadata(self, key: &str, value: &str) -> Self

Add a metadata key-value pair to the dataset

Source

pub fn n_samples(&self) -> usize

Get the number of samples in the dataset

Examples found in repository ?

examples/toy_datasets.rs (line 6)

3fn main() -> Result<(), Box<dyn std::error::Error>> {
4    let iris = load_iris()?;
5    println!("Iris dataset loaded:");
6    println!("  Samples: {}", iris.n_samples());
7    println!("  Features: {}", iris.n_features());
8    println!(
9        "  Target classes: {}",
10        iris.target_names.as_ref().map_or(0, |v| v.len())
11    );
12
13    let boston = load_boston()?;
14    println!("\nBoston Housing dataset loaded:");
15    println!("  Samples: {}", boston.n_samples());
16    println!("  Features: {}", boston.n_features());
17
18    Ok(())
19}

More examples

Hide additional examples

examples/csv_loading.rs (line 12)

3fn main() -> Result<(), Box<dyn std::error::Error>> {
4    // Load a CSV file with headers and target column
5    let dataset = load_csv(
6        "scirs2-datasets/data/example.csv",
7        true,    // has header
8        Some(3), // target column index (0-based)
9    )?;
10
11    println!("CSV dataset loaded successfully:");
12    println!("  Samples: {}", dataset.n_samples());
13    println!("  Features: {}", dataset.n_features());
14    println!("  Feature names: {:?}", dataset.feature_names);
15
16    // Access data and target
17    println!("\nFirst 3 samples:");
18    for i in 0..3 {
19        let features = dataset.data.row(i);
20        let target = dataset.target.as_ref().map(|t| t[i]);
21        println!(
22            "  Sample {}: Features = {:?}, Target = {:?}",
23            i, features, target
24        );
25    }
26
27    Ok(())
28}

examples/dataset_loaders.rs (line 33)

6fn main() {
7    // Check if a CSV file is provided as a command-line argument
8    let args: Vec<String> = env::args().collect();
9    if args.len() < 2 {
10        println!("Usage: {} <path_to_csv_file>", args[0]);
11        println!("Example: {} examples/sample_data.csv", args[0]);
12        return;
13    }
14
15    let file_path = &args[1];
16
17    // Verify the file exists
18    if !Path::new(file_path).exists() {
19        println!("Error: File '{}' does not exist", file_path);
20        return;
21    }
22
23    // Load CSV file
24    println!("Loading CSV file: {}", file_path);
25    match loaders::load_csv(file_path, true, None) {
26        Ok(dataset) => {
27            print_dataset_info(&dataset, "Loaded CSV");
28
29            // Split the dataset for demonstration
30            println!("\nDemonstrating train-test split...");
31            match dataset.train_test_split(0.2, Some(42)) {
32                Ok((train, test)) => {
33                    println!("Training set: {} samples", train.n_samples());
34                    println!("Test set: {} samples", test.n_samples());
35
36                    // Save as JSON for demonstration
37                    let json_path = format!("{}.json", file_path);
38                    println!("\nSaving training dataset to JSON: {}", json_path);
39                    if let Err(e) = loaders::save_json(&train, &json_path) {
40                        println!("Error saving JSON: {}", e);
41                    } else {
42                        println!("Successfully saved JSON file");
43
44                        // Load back the JSON file
45                        println!("\nLoading back from JSON file...");
46                        match loaders::load_json(&json_path) {
47                            Ok(loaded) => {
48                                print_dataset_info(&loaded, "Loaded JSON");
49                            }
50                            Err(e) => println!("Error loading JSON: {}", e),
51                        }
52                    }
53                }
54                Err(e) => println!("Error splitting dataset: {}", e),
55            }
56        }
57        Err(e) => println!("Error loading CSV: {}", e),
58    }
59}
60
61fn print_dataset_info(dataset: &Dataset, name: &str) {
62    println!("=== {} Dataset ===", name);
63    println!("Number of samples: {}", dataset.n_samples());
64    println!("Number of features: {}", dataset.n_features());
65
66    if let Some(feature_names) = &dataset.feature_names {
67        println!(
68            "Features: {:?}",
69            &feature_names[0..std::cmp::min(5, feature_names.len())]
70        );
71        if feature_names.len() > 5 {
72            println!("... and {} more", feature_names.len() - 5);
73        }
74    }
75
76    if let Some(target) = &dataset.target {
77        println!("Target shape: {}", target.len());
78
79        if let Some(target_names) = &dataset.target_names {
80            println!("Target classes: {:?}", target_names);
81        }
82    }
83
84    for (key, value) in &dataset.metadata {
85        println!("Metadata - {}: {}", key, value);
86    }
87}

examples/data_generators.rs (line 25)

5fn main() -> Result<(), Box<dyn std::error::Error>> {
6    println!("Creating synthetic datasets...\n");
7
8    // Generate classification dataset
9    let n_samples = 100;
10    let n_features = 5;
11
12    let classification_data = make_classification(
13        n_samples,
14        n_features,
15        3,        // 3 classes
16        2,        // 2 clusters per class
17        3,        // 3 informative features
18        Some(42), // random seed
19    )?;
20
21    // Train-test split
22    let (train, test) = classification_data.train_test_split(0.2, Some(42))?;
23
24    println!("Classification dataset:");
25    println!("  Total samples: {}", classification_data.n_samples());
26    println!("  Features: {}", classification_data.n_features());
27    println!("  Training samples: {}", train.n_samples());
28    println!("  Test samples: {}", test.n_samples());
29
30    // Generate regression dataset
31    let regression_data = make_regression(
32        n_samples,
33        n_features,
34        3,   // 3 informative features
35        0.5, // noise level
36        Some(42),
37    )?;
38
39    println!("\nRegression dataset:");
40    println!("  Samples: {}", regression_data.n_samples());
41    println!("  Features: {}", regression_data.n_features());
42
43    // Normalize the data (in-place)
44    let mut data_copy = regression_data.data.clone();
45    normalize(&mut data_copy);
46    println!("  Data normalized successfully");
47
48    // Generate clustering data (blobs)
49    let clustering_data = make_blobs(
50        n_samples,
51        2,   // 2 features for easy visualization
52        4,   // 4 clusters
53        0.8, // cluster standard deviation
54        Some(42),
55    )?;
56
57    println!("\nClustering dataset (blobs):");
58    println!("  Samples: {}", clustering_data.n_samples());
59    println!("  Features: {}", clustering_data.n_features());
60
61    // Find the number of clusters by finding the max value of target
62    let num_clusters = clustering_data.target.as_ref().map_or(0, |t| {
63        let mut max_val = -1.0;
64        for &val in t.iter() {
65            if val > max_val {
66                max_val = val;
67            }
68        }
69        (max_val as usize) + 1
70    });
71
72    println!("  Clusters: {}", num_clusters);
73
74    // Generate time series data
75    let time_series = make_time_series(
76        100,  // 100 time steps
77        3,    // 3 features/variables
78        true, // with trend
79        true, // with seasonality
80        0.2,  // noise level
81        Some(42),
82    )?;
83
84    println!("\nTime series dataset:");
85    println!("  Time steps: {}", time_series.n_samples());
86    println!("  Features: {}", time_series.n_features());
87
88    Ok(())
89}

Source

pub fn n_features(&self) -> usize

Get the number of features in the dataset

Examples found in repository ?

examples/toy_datasets.rs (line 7)

3fn main() -> Result<(), Box<dyn std::error::Error>> {
4    let iris = load_iris()?;
5    println!("Iris dataset loaded:");
6    println!("  Samples: {}", iris.n_samples());
7    println!("  Features: {}", iris.n_features());
8    println!(
9        "  Target classes: {}",
10        iris.target_names.as_ref().map_or(0, |v| v.len())
11    );
12
13    let boston = load_boston()?;
14    println!("\nBoston Housing dataset loaded:");
15    println!("  Samples: {}", boston.n_samples());
16    println!("  Features: {}", boston.n_features());
17
18    Ok(())
19}

More examples

Hide additional examples

examples/csv_loading.rs (line 13)

3fn main() -> Result<(), Box<dyn std::error::Error>> {
4    // Load a CSV file with headers and target column
5    let dataset = load_csv(
6        "scirs2-datasets/data/example.csv",
7        true,    // has header
8        Some(3), // target column index (0-based)
9    )?;
10
11    println!("CSV dataset loaded successfully:");
12    println!("  Samples: {}", dataset.n_samples());
13    println!("  Features: {}", dataset.n_features());
14    println!("  Feature names: {:?}", dataset.feature_names);
15
16    // Access data and target
17    println!("\nFirst 3 samples:");
18    for i in 0..3 {
19        let features = dataset.data.row(i);
20        let target = dataset.target.as_ref().map(|t| t[i]);
21        println!(
22            "  Sample {}: Features = {:?}, Target = {:?}",
23            i, features, target
24        );
25    }
26
27    Ok(())
28}

examples/dataset_loaders.rs (line 64)

61fn print_dataset_info(dataset: &Dataset, name: &str) {
62    println!("=== {} Dataset ===", name);
63    println!("Number of samples: {}", dataset.n_samples());
64    println!("Number of features: {}", dataset.n_features());
65
66    if let Some(feature_names) = &dataset.feature_names {
67        println!(
68            "Features: {:?}",
69            &feature_names[0..std::cmp::min(5, feature_names.len())]
70        );
71        if feature_names.len() > 5 {
72            println!("... and {} more", feature_names.len() - 5);
73        }
74    }
75
76    if let Some(target) = &dataset.target {
77        println!("Target shape: {}", target.len());
78
79        if let Some(target_names) = &dataset.target_names {
80            println!("Target classes: {:?}", target_names);
81        }
82    }
83
84    for (key, value) in &dataset.metadata {
85        println!("Metadata - {}: {}", key, value);
86    }
87}

examples/data_generators.rs (line 26)

5fn main() -> Result<(), Box<dyn std::error::Error>> {
6    println!("Creating synthetic datasets...\n");
7
8    // Generate classification dataset
9    let n_samples = 100;
10    let n_features = 5;
11
12    let classification_data = make_classification(
13        n_samples,
14        n_features,
15        3,        // 3 classes
16        2,        // 2 clusters per class
17        3,        // 3 informative features
18        Some(42), // random seed
19    )?;
20
21    // Train-test split
22    let (train, test) = classification_data.train_test_split(0.2, Some(42))?;
23
24    println!("Classification dataset:");
25    println!("  Total samples: {}", classification_data.n_samples());
26    println!("  Features: {}", classification_data.n_features());
27    println!("  Training samples: {}", train.n_samples());
28    println!("  Test samples: {}", test.n_samples());
29
30    // Generate regression dataset
31    let regression_data = make_regression(
32        n_samples,
33        n_features,
34        3,   // 3 informative features
35        0.5, // noise level
36        Some(42),
37    )?;
38
39    println!("\nRegression dataset:");
40    println!("  Samples: {}", regression_data.n_samples());
41    println!("  Features: {}", regression_data.n_features());
42
43    // Normalize the data (in-place)
44    let mut data_copy = regression_data.data.clone();
45    normalize(&mut data_copy);
46    println!("  Data normalized successfully");
47
48    // Generate clustering data (blobs)
49    let clustering_data = make_blobs(
50        n_samples,
51        2,   // 2 features for easy visualization
52        4,   // 4 clusters
53        0.8, // cluster standard deviation
54        Some(42),
55    )?;
56
57    println!("\nClustering dataset (blobs):");
58    println!("  Samples: {}", clustering_data.n_samples());
59    println!("  Features: {}", clustering_data.n_features());
60
61    // Find the number of clusters by finding the max value of target
62    let num_clusters = clustering_data.target.as_ref().map_or(0, |t| {
63        let mut max_val = -1.0;
64        for &val in t.iter() {
65            if val > max_val {
66                max_val = val;
67            }
68        }
69        (max_val as usize) + 1
70    });
71
72    println!("  Clusters: {}", num_clusters);
73
74    // Generate time series data
75    let time_series = make_time_series(
76        100,  // 100 time steps
77        3,    // 3 features/variables
78        true, // with trend
79        true, // with seasonality
80        0.2,  // noise level
81        Some(42),
82    )?;
83
84    println!("\nTime series dataset:");
85    println!("  Time steps: {}", time_series.n_samples());
86    println!("  Features: {}", time_series.n_features());
87
88    Ok(())
89}

Source

pub fn train_test_split( &self, test_size: f64, random_seed: Option<u64>, ) -> Result<(Dataset, Dataset)>

Split the dataset into training and test sets

Examples found in repository ?

examples/dataset_loaders.rs (line 31)

6fn main() {
7    // Check if a CSV file is provided as a command-line argument
8    let args: Vec<String> = env::args().collect();
9    if args.len() < 2 {
10        println!("Usage: {} <path_to_csv_file>", args[0]);
11        println!("Example: {} examples/sample_data.csv", args[0]);
12        return;
13    }
14
15    let file_path = &args[1];
16
17    // Verify the file exists
18    if !Path::new(file_path).exists() {
19        println!("Error: File '{}' does not exist", file_path);
20        return;
21    }
22
23    // Load CSV file
24    println!("Loading CSV file: {}", file_path);
25    match loaders::load_csv(file_path, true, None) {
26        Ok(dataset) => {
27            print_dataset_info(&dataset, "Loaded CSV");
28
29            // Split the dataset for demonstration
30            println!("\nDemonstrating train-test split...");
31            match dataset.train_test_split(0.2, Some(42)) {
32                Ok((train, test)) => {
33                    println!("Training set: {} samples", train.n_samples());
34                    println!("Test set: {} samples", test.n_samples());
35
36                    // Save as JSON for demonstration
37                    let json_path = format!("{}.json", file_path);
38                    println!("\nSaving training dataset to JSON: {}", json_path);
39                    if let Err(e) = loaders::save_json(&train, &json_path) {
40                        println!("Error saving JSON: {}", e);
41                    } else {
42                        println!("Successfully saved JSON file");
43
44                        // Load back the JSON file
45                        println!("\nLoading back from JSON file...");
46                        match loaders::load_json(&json_path) {
47                            Ok(loaded) => {
48                                print_dataset_info(&loaded, "Loaded JSON");
49                            }
50                            Err(e) => println!("Error loading JSON: {}", e),
51                        }
52                    }
53                }
54                Err(e) => println!("Error splitting dataset: {}", e),
55            }
56        }
57        Err(e) => println!("Error loading CSV: {}", e),
58    }
59}

More examples

Hide additional examples

examples/data_generators.rs (line 22)

5fn main() -> Result<(), Box<dyn std::error::Error>> {
6    println!("Creating synthetic datasets...\n");
7
8    // Generate classification dataset
9    let n_samples = 100;
10    let n_features = 5;
11
12    let classification_data = make_classification(
13        n_samples,
14        n_features,
15        3,        // 3 classes
16        2,        // 2 clusters per class
17        3,        // 3 informative features
18        Some(42), // random seed
19    )?;
20
21    // Train-test split
22    let (train, test) = classification_data.train_test_split(0.2, Some(42))?;
23
24    println!("Classification dataset:");
25    println!("  Total samples: {}", classification_data.n_samples());
26    println!("  Features: {}", classification_data.n_features());
27    println!("  Training samples: {}", train.n_samples());
28    println!("  Test samples: {}", test.n_samples());
29
30    // Generate regression dataset
31    let regression_data = make_regression(
32        n_samples,
33        n_features,
34        3,   // 3 informative features
35        0.5, // noise level
36        Some(42),
37    )?;
38
39    println!("\nRegression dataset:");
40    println!("  Samples: {}", regression_data.n_samples());
41    println!("  Features: {}", regression_data.n_features());
42
43    // Normalize the data (in-place)
44    let mut data_copy = regression_data.data.clone();
45    normalize(&mut data_copy);
46    println!("  Data normalized successfully");
47
48    // Generate clustering data (blobs)
49    let clustering_data = make_blobs(
50        n_samples,
51        2,   // 2 features for easy visualization
52        4,   // 4 clusters
53        0.8, // cluster standard deviation
54        Some(42),
55    )?;
56
57    println!("\nClustering dataset (blobs):");
58    println!("  Samples: {}", clustering_data.n_samples());
59    println!("  Features: {}", clustering_data.n_features());
60
61    // Find the number of clusters by finding the max value of target
62    let num_clusters = clustering_data.target.as_ref().map_or(0, |t| {
63        let mut max_val = -1.0;
64        for &val in t.iter() {
65            if val > max_val {
66                max_val = val;
67            }
68        }
69        (max_val as usize) + 1
70    });
71
72    println!("  Clusters: {}", num_clusters);
73
74    // Generate time series data
75    let time_series = make_time_series(
76        100,  // 100 time steps
77        3,    // 3 features/variables
78        true, // with trend
79        true, // with seasonality
80        0.2,  // noise level
81        Some(42),
82    )?;
83
84    println!("\nTime series dataset:");
85    println!("  Time steps: {}", time_series.n_samples());
86    println!("  Features: {}", time_series.n_features());
87
88    Ok(())
89}

Trait Implementations§

Source §

impl Clone for Dataset

Source §

fn clone(&self) -> Dataset

Returns a copy of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for Dataset

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl<'de> Deserialize<'de> for Dataset

Source §

fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>,

Deserialize this value from the given Serde deserializer. Read more

Source §

impl Serialize for Dataset

Source §

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

Serialize this value into the given Serde serializer. Read more

Auto Trait Implementations§

§

impl UnwindSafe for Dataset

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dest. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §