scirs2_datasets/lib.rs
1//! Datasets module for SciRS2
2//!
3//! This module provides dataset loading utilities similar to scikit-learn's datasets module.
4//! It includes toy datasets, sample datasets, time series datasets, data generators,
5//! and utilities for loading and processing datasets.
6//!
7//! # Features
8//!
9//! - **Toy datasets**: Classic datasets like Iris, Boston Housing, Breast Cancer, and Digits
10//! - **Data generators**: Create synthetic datasets for classification, regression, clustering, and time series
11//! - **Cross-validation utilities**: K-fold, stratified, and time series cross-validation
12//! - **Dataset utilities**: Train/test splitting, normalization, and metadata handling
13//! - **Caching**: Efficient caching system for downloaded datasets
14//! - **Registry**: Centralized registry for dataset metadata and locations
15//!
16//! # Examples
17//!
18//! ## Loading toy datasets
19//!
20//! ```rust
21//! use scirs2_datasets::{load_iris, load_boston};
22//!
23//! // Load the classic Iris dataset
24//! let iris = load_iris().unwrap();
25//! println!("Iris dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
26//!
27//! // Load the Boston housing dataset
28//! let boston = load_boston().unwrap();
29//! println!("Boston dataset: {} samples, {} features", boston.n_samples(), boston.n_features());
30//! ```
31//!
32//! ## Generating synthetic datasets
33//!
34//! ```rust
35//! use scirs2_datasets::{make_classification, make_regression, make_blobs, make_spirals, make_moons};
36//!
37//! // Generate a classification dataset
38//! let classification = make_classification(100, 5, 3, 2, 4, Some(42)).unwrap();
39//! println!("Classification dataset: {} samples, {} features, {} classes",
40//!          classification.n_samples(), classification.n_features(), 3);
41//!
42//! // Generate a regression dataset
43//! let regression = make_regression(50, 4, 3, 0.1, Some(42)).unwrap();
44//! println!("Regression dataset: {} samples, {} features",
45//!          regression.n_samples(), regression.n_features());
46//!
47//! // Generate a clustering dataset
48//! let blobs = make_blobs(80, 3, 4, 1.0, Some(42)).unwrap();
49//! println!("Blobs dataset: {} samples, {} features, {} clusters",
50//!          blobs.n_samples(), blobs.n_features(), 4);
51//!
52//! // Generate non-linear patterns
53//! let spirals = make_spirals(200, 2, 0.1, Some(42)).unwrap();
54//! let moons = make_moons(150, 0.05, Some(42)).unwrap();
55//! ```
56//!
57//! ## Cross-validation
58//!
59//! ```rust
60//! use scirs2_datasets::{load_iris, k_fold_split, stratified_k_fold_split};
61//!
62//! let iris = load_iris().unwrap();
63//!
64//! // K-fold cross-validation
65//! let k_folds = k_fold_split(iris.n_samples(), 5, true, Some(42)).unwrap();
66//! println!("Created {} folds for K-fold CV", k_folds.len());
67//!
68//! // Stratified K-fold cross-validation
69//! if let Some(target) = &iris.target {
70//!     let stratified_folds = stratified_k_fold_split(target, 5, true, Some(42)).unwrap();
71//!     println!("Created {} stratified folds", stratified_folds.len());
72//! }
73//! ```
74//!
75//! ## Dataset manipulation
76//!
77//! ```rust
78//! use scirs2_datasets::{load_iris, Dataset};
79//!
80//! let iris = load_iris().unwrap();
81//!
82//! // Access dataset properties
83//! println!("Dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
84//! if let Some(feature_names) = iris.feature_names() {
85//!     println!("Features: {:?}", feature_names);
86//! }
87//! ```
88
89#![warn(missing_docs)]
90
/// Caching for downloaded datasets.
///
/// Provides the cache types that are re-exported at the crate root, such as
/// `CacheManager`, `DatasetCache` and `CacheStats`, together with `get_cache_dir`.
91pub mod cache;
// NOTE(review): summary inferred from the module name only; the module's
// contents are not visible from this file — confirm.
/// Error types for the crate.
92pub mod error;
/// Synthetic data generators.
///
/// Provides the `make_*` constructors (`make_classification`, `make_regression`,
/// `make_blobs`, `make_moons`, ...) as well as `inject_missing_data`,
/// `inject_outliers` and `add_time_series_noise`, all re-exported at the crate root.
93pub mod generators;
// NOTE(review): summary taken from the crate-level docs ("utilities for loading
// and processing datasets"); nothing from this module appears in the crate-root
// re-export list visible in this file, so it is reached as `loaders::...`.
/// Dataset loading utilities.
94pub mod loaders;
/// Registry of dataset metadata and locations.
///
/// Every public item of this module is glob re-exported at the crate root.
95pub mod registry;
/// Sample datasets.
///
/// Every public item of this module is glob re-exported at the crate root.
96pub mod sample;
// NOTE(review): `make_time_series` and `time_series_split` are re-exported from
// `generators` and `utils` respectively, not from this module; nothing from
// `time_series` appears in the crate-root re-export list visible in this file.
/// Time series datasets.
97pub mod time_series;
// NOTE(review): the `load_iris` / `load_boston` loaders used in the crate-level
// examples are presumably defined here — confirm against the module source.
/// Toy datasets.
///
/// The crate-level docs list Iris, Boston Housing, Breast Cancer and Digits as
/// the classic datasets in this category. Every public item of this module is
/// glob re-exported at the crate root.
98pub mod toy;
99/// Core utilities for working with datasets
100///
101/// This module provides the Dataset struct and helper functions for
102/// manipulating and transforming datasets.
103pub mod utils;
104
105// Temporary module to test method resolution conflict
// NOTE(review): this declaration carries no `#[cfg(test)]` gate, so unless the
// module file gates itself it is compiled into every build of the crate. It is
// private (no `pub`), so it adds nothing to the public API.
// TODO: remove once the method-resolution investigation is finished.
106mod method_resolution_test;
107
108// Re-export commonly used functionality
// Cache API: cache-directory lookup, cache handles/managers, batch operations
// and the statistics/info types that go with them.
109pub use cache::{
110 get_cache_dir, BatchOperations, BatchResult, CacheFileInfo, CacheManager, CacheStats,
111 DatasetCache, DetailedCacheStats,
112};
// Synthetic data generators (`make_*`), the corruption/noise helpers, and the
// `MissingPattern` / `OutlierType` types re-exported alongside them.
113pub use generators::{
114 add_time_series_noise, inject_missing_data, inject_outliers, make_anisotropic_blobs,
115 make_blobs, make_circles, make_classification, make_corrupted_dataset,
116 make_hierarchical_clusters, make_moons, make_regression, make_spirals, make_swiss_roll,
117 make_time_series, MissingPattern, OutlierType,
118};
// Glob re-exports: every public item of these three modules is surfaced at the
// crate root. The `load_iris` / `load_boston` functions imported from the crate
// root in the crate-level examples are not named in any explicit list here, so
// they presumably arrive through one of these globs — confirm which module.
119pub use registry::*;
120pub use sample::*;
121pub use toy::*;
// Dataset helpers re-exported by name: the `Dataset` type, cross-validation
// splitters (`k_fold_split`, `stratified_k_fold_split`, `time_series_split`),
// sampling/balancing functions, scaling functions and feature constructors,
// plus the `BalancingStrategy`, `BinningStrategy` and `CrossValidationFolds` types.
122pub use utils::{
123 create_balanced_dataset, create_binned_features, generate_synthetic_samples, importance_sample,
124 k_fold_split, min_max_scale, polynomial_features, random_oversample, random_sample,
125 random_undersample, robust_scale, statistical_features, stratified_k_fold_split,
126 stratified_sample, time_series_split, BalancingStrategy, BinningStrategy, CrossValidationFolds,
127 Dataset,
128};
128};