// scirs2_datasets/lib.rs

// NOTE(review): this crate-wide `allow(deprecated)` has no stated rationale in this file.
// Confirm which deprecated items require it and narrow the scope if possible.
#![allow(deprecated)]
//! # SciRS2 Datasets - Dataset Loading and Generation
//!
//! **scirs2-datasets** provides dataset utilities modeled after scikit-learn's `datasets` module,
//! offering toy datasets (Iris, Boston, MNIST), synthetic data generators, cross-validation splitters,
//! and data preprocessing utilities for machine learning workflows.
//!
//! ## 🎯 Key Features
//!
//! - **Toy Datasets**: Classic datasets (Iris, Boston Housing, Breast Cancer, Digits)
//! - **Data Generators**: Synthetic data for classification, regression, clustering
//! - **Cross-Validation**: K-fold, stratified, time series CV splitters
//! - **Preprocessing**: Train/test split, normalization, feature scaling
//! - **Caching**: Efficient disk caching for downloaded datasets
//!
//! ## 📦 API Overview
//!
//! | SciRS2 Function | scikit-learn Equivalent | Description |
//! |-----------------|-------------------------|-------------|
//! | `load_iris` | `sklearn.datasets.load_iris` | Classic Iris classification dataset |
//! | `load_boston` | `sklearn.datasets.load_boston` | Boston housing regression dataset |
//! | `make_classification` | `sklearn.datasets.make_classification` | Synthetic classification data |
//! | `make_regression` | `sklearn.datasets.make_regression` | Synthetic regression data |
//! | `make_blobs` | `sklearn.datasets.make_blobs` | Synthetic clustering data |
//! | `k_fold_split` | `sklearn.model_selection.KFold` | K-fold cross-validation |
//!
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-datasets = "0.1.0-rc.2"
//! ```
//!
//! ```rust
//! use scirs2_datasets::{load_iris, make_classification};
//!
//! // Load classic Iris dataset
//! let iris = load_iris().unwrap();
//! println!("{} samples, {} features", iris.n_samples(), iris.n_features());
//!
//! // Generate synthetic classification data
//! let data = make_classification(100, 5, 3, 2, 4, Some(42)).unwrap();
//! ```
//!
//! ## 🔒 Version: 0.1.0-rc.2 (October 03, 2025)
//!
//! # Examples
//!
//! ## Loading toy datasets
//!
//! ```rust
//! use scirs2_datasets::{load_iris, load_boston};
//!
//! // Load the classic Iris dataset
//! let iris = load_iris().unwrap();
//! println!("Iris dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
//!
//! // Load the Boston housing dataset
//! let boston = load_boston().unwrap();
//! println!("Boston dataset: {} samples, {} features", boston.n_samples(), boston.n_features());
//! ```
//!
//! ## Generating synthetic datasets
//!
//! ```rust
//! use scirs2_datasets::{make_classification, make_regression, make_blobs, make_spirals, make_moons};
//!
//! // Generate a classification dataset
//! let classification = make_classification(100, 5, 3, 2, 4, Some(42)).unwrap();
//! println!("Classification dataset: {} samples, {} features, {} classes",
//!          classification.n_samples(), classification.n_features(), 3);
//!
//! // Generate a regression dataset
//! let regression = make_regression(50, 4, 3, 0.1, Some(42)).unwrap();
//! println!("Regression dataset: {} samples, {} features",
//!          regression.n_samples(), regression.n_features());
//!
//! // Generate a clustering dataset
//! let blobs = make_blobs(80, 3, 4, 1.0, Some(42)).unwrap();
//! println!("Blobs dataset: {} samples, {} features, {} clusters",
//!          blobs.n_samples(), blobs.n_features(), 4);
//!
//! // Generate non-linear patterns
//! let spirals = make_spirals(200, 2, 0.1, Some(42)).unwrap();
//! let moons = make_moons(150, 0.05, Some(42)).unwrap();
//! ```
//!
//! ## Cross-validation
//!
//! ```rust
//! use scirs2_datasets::{load_iris, k_fold_split, stratified_k_fold_split};
//!
//! let iris = load_iris().unwrap();
//!
//! // K-fold cross-validation
//! let k_folds = k_fold_split(iris.n_samples(), 5, true, Some(42)).unwrap();
//! println!("Created {} folds for K-fold CV", k_folds.len());
//!
//! // Stratified K-fold cross-validation
//! if let Some(target) = &iris.target {
//!     let stratified_folds = stratified_k_fold_split(target, 5, true, Some(42)).unwrap();
//!     println!("Created {} stratified folds", stratified_folds.len());
//! }
//! ```
//!
//! ## Dataset manipulation
//!
//! ```rust
//! use scirs2_datasets::{load_iris, Dataset};
//!
//! let iris = load_iris().unwrap();
//!
//! // Access dataset properties
//! println!("Dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
//! if let Some(featurenames) = iris.featurenames() {
//!     println!("Features: {:?}", featurenames);
//! }
//! ```

#![warn(missing_docs)]

122pub mod advanced_generators;
123pub mod benchmarks;
124pub mod cache;
125pub mod cloud;
126pub mod distributed;
127pub mod domain_specific;
128pub mod error;
129pub mod explore;
130pub mod external;
131pub mod generators;
132pub mod gpu;
133pub mod gpu_optimization;
134pub mod loaders;
135pub mod ml_integration;
136pub mod real_world;
137pub mod registry;
138pub mod sample;
139pub mod streaming;
140pub mod time_series;
141pub mod toy;
142/// Core utilities for working with datasets
143///
144/// This module provides the Dataset struct and helper functions for
145/// manipulating and transforming datasets.
146pub mod utils;
147
148/// API stability guarantees and compatibility documentation
149///
150/// This module defines the API stability levels and compatibility guarantees
151/// for the scirs2-datasets crate.
152pub mod stability;
153
154// Temporary module to test method resolution conflict
155mod method_resolution_test;
156
157pub mod adaptive_streaming_engine;
158pub mod neuromorphic_data_processor;
159pub mod quantum_enhanced_generators;
160pub mod quantum_neuromorphic_fusion;
161
162// Re-export commonly used functionality
163pub use adaptive_streaming_engine::{
164    create_adaptive_engine, create_adaptive_engine_with_config, AdaptiveStreamConfig,
165    AdaptiveStreamingEngine, AlertSeverity, AlertType, ChunkMetadata, DataCharacteristics,
166    MemoryStrategy, PatternType, PerformanceMetrics, QualityAlert, QualityMetrics,
167    StatisticalMoments, StreamChunk, TrendDirection, TrendIndicators,
168};
169pub use advanced_generators::{
170    make_adversarial_examples, make_anomaly_dataset, make_continual_learning_dataset,
171    make_domain_adaptation_dataset, make_few_shot_dataset, make_multitask_dataset,
172    AdversarialConfig, AnomalyConfig, AnomalyType, AttackMethod, ContinualLearningDataset,
173    DomainAdaptationConfig, DomainAdaptationDataset, FewShotDataset, MultiTaskConfig,
174    MultiTaskDataset, TaskType,
175};
176pub use benchmarks::{BenchmarkResult, BenchmarkRunner, BenchmarkSuite, PerformanceComparison};
177pub use cloud::{
178    presets::{azure_client, gcs_client, public_s3_client, s3_client, s3_compatible_client},
179    public_datasets::{AWSOpenData, AzureOpenData, GCPPublicData},
180    CloudClient, CloudConfig, CloudCredentials, CloudProvider,
181};
182pub use distributed::{DistributedConfig, DistributedProcessor, ScalingMethod, ScalingParameters};
183pub use domain_specific::{
184    astronomy::StellarDatasets,
185    climate::ClimateDatasets,
186    convenience::{
187        list_domain_datasets, load_atmospheric_chemistry, load_climate_data, load_exoplanets,
188        load_gene_expression, load_stellar_classification,
189    },
190    genomics::GenomicsDatasets,
191    DomainConfig, QualityFilters,
192};
193pub use explore::{
194    convenience::{explore, export_summary, info, quick_summary},
195    DatasetExplorer, DatasetSummary, ExploreConfig, FeatureStatistics, InferredDataType,
196    OutputFormat, QualityAssessment,
197};
198#[cfg(not(feature = "download"))]
199pub use external::convenience::{load_github_dataset_sync, load_uci_dataset_sync};
200pub use external::{
201    convenience::{list_uci_datasets, load_from_url_sync},
202    repositories::{GitHubRepository, KaggleRepository, UCIRepository},
203    ExternalClient, ExternalConfig, ProgressCallback,
204};
205pub use ml_integration::{
206    convenience::{create_experiment, cv_split, prepare_for_ml, train_test_split},
207    CrossValidationResults, DataSplit, MLExperiment, MLPipeline, MLPipelineConfig,
208    ScalingMethod as MLScalingMethod,
209};
210
211pub use cache::{
212    get_cachedir, BatchOperations, BatchResult, CacheFileInfo, CacheManager, CacheStats,
213    DatasetCache, DetailedCacheStats,
214};
215#[cfg(feature = "download")]
216pub use external::convenience::{load_from_url, load_github_dataset, load_uci_dataset};
217pub use generators::{
218    add_time_series_noise, benchmark_gpu_vs_cpu, get_gpu_info, gpu_is_available,
219    inject_missing_data, inject_outliers, make_anisotropic_blobs, make_blobs, make_blobs_gpu,
220    make_circles, make_classification, make_classification_gpu, make_corrupted_dataset, make_helix,
221    make_hierarchical_clusters, make_intersecting_manifolds, make_manifold, make_moons,
222    make_regression, make_regression_gpu, make_s_curve, make_severed_sphere, make_spirals,
223    make_swiss_roll, make_swiss_roll_advanced, make_time_series, make_torus, make_twin_peaks,
224    ManifoldConfig, ManifoldType, MissingPattern, OutlierType,
225};
226pub use gpu::{
227    get_optimal_gpu_config, is_cuda_available, is_opencl_available, list_gpu_devices,
228    make_blobs_auto_gpu, make_classification_auto_gpu, make_regression_auto_gpu, GpuBackend,
229    GpuBenchmark, GpuBenchmarkResults, GpuConfig, GpuContext, GpuDeviceInfo, GpuMemoryConfig,
230};
231pub use gpu_optimization::{
232    benchmark_advanced_performance, generate_advanced_matrix, AdvancedGpuOptimizer,
233    AdvancedKernelConfig, BenchmarkResult as AdvancedBenchmarkResult, DataLayout,
234    LoadBalancingMethod, MemoryAccessPattern, PerformanceBenchmarkResults, SpecializationLevel,
235    VectorizationStrategy,
236};
237pub use loaders::{
238    load_csv, load_csv_legacy, load_csv_parallel, load_csv_streaming, load_json, load_raw,
239    save_json, CsvConfig, DatasetChunkIterator, StreamingConfig,
240};
241pub use neuromorphic_data_processor::{
242    create_neuromorphic_processor, create_neuromorphic_processor_with_topology, NetworkTopology,
243    NeuromorphicProcessor, NeuromorphicTransform, SynapticPlasticity,
244};
245pub use quantum_enhanced_generators::{
246    make_quantum_blobs, make_quantum_classification, make_quantum_regression,
247    QuantumDatasetGenerator,
248};
249pub use quantum_neuromorphic_fusion::{
250    create_fusion_with_params, create_quantum_neuromorphic_fusion, QuantumBioFusionResult,
251    QuantumInterference, QuantumNeuromorphicFusion,
252};
253pub use real_world::{
254    list_real_world_datasets, load_adult, load_california_housing, load_heart_disease,
255    load_red_wine_quality, load_titanic, RealWorldConfig, RealWorldDatasets,
256};
257pub use registry::{get_registry, load_dataset_byname, DatasetMetadata, DatasetRegistry};
258pub use sample::*;
259pub use streaming::{
260    stream_classification, stream_csv, stream_regression, DataChunk, StreamConfig, StreamProcessor,
261    StreamStats, StreamTransformer, StreamingIterator,
262};
263pub use toy::*;
264pub use utils::{
265    analyze_dataset_advanced, create_balanced_dataset, create_binned_features,
266    generate_synthetic_samples, importance_sample, k_fold_split, min_max_scale,
267    polynomial_features, quick_quality_assessment, random_oversample, random_sample,
268    random_undersample, robust_scale, statistical_features, stratified_k_fold_split,
269    stratified_sample, time_series_split, AdvancedDatasetAnalyzer, AdvancedQualityMetrics,
270    BalancingStrategy, BinningStrategy, CorrelationInsights, CrossValidationFolds, Dataset,
271    NormalityAssessment,
272};