//! # SciRS2 Datasets - Dataset Loading and Generation
//!
//! **scirs2-datasets** provides dataset utilities modeled after scikit-learn's `datasets` module,
//! offering toy datasets (Iris, Boston, MNIST), synthetic data generators, cross-validation splitters,
//! and data preprocessing utilities for machine learning workflows.
//!
//! ## 🎯 Key Features
//!
//! - **Toy Datasets**: Classic datasets (Iris, Boston Housing, Breast Cancer, Digits)
//! - **Data Generators**: Synthetic data for classification, regression, clustering
//! - **Cross-Validation**: K-fold, stratified, time series CV splitters
//! - **Preprocessing**: Train/test split, normalization, feature scaling
//! - **Caching**: Efficient disk caching for downloaded datasets
//!
//! ## 📦 Module Overview
//!
//! | SciRS2 Function | scikit-learn Equivalent | Description |
//! |-----------------|-------------------------|-------------|
//! | `load_iris` | `sklearn.datasets.load_iris` | Classic Iris classification dataset |
//! | `load_boston` | `sklearn.datasets.load_boston` | Boston housing regression dataset |
//! | `make_classification` | `sklearn.datasets.make_classification` | Synthetic classification data |
//! | `make_regression` | `sklearn.datasets.make_regression` | Synthetic regression data |
//! | `make_blobs` | `sklearn.datasets.make_blobs` | Synthetic clustering data |
//! | `k_fold_split` | `sklearn.model_selection.KFold` | K-fold cross-validation |
//!
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-datasets = "0.4.0"
//! ```
//!
//! ```rust
//! use scirs2_datasets::{load_iris, make_classification};
//!
//! // Load classic Iris dataset
//! let iris = load_iris().expect("Operation failed");
//! println!("{} samples, {} features", iris.n_samples(), iris.n_features());
//!
//! // Generate synthetic classification data
//! let data = make_classification(100, 5, 3, 2, 4, Some(42)).expect("Operation failed");
//! ```
//!
//! ## 🔒 Version: 0.4.0
//!
//! ### v0.4.0 New Features
//!
//! - **Lazy Loading**: Memory-mapped datasets with zero-copy views
//! - **Data Augmentation**: GPU-accelerated augmentation pipeline
//! - **Parallel Preprocessing**: Multi-threaded preprocessing with work-stealing
//! - **Distributed Loading**: Shard-aware loading for distributed training
//! - **Format Support**: Parquet, Arrow, HDF5 integration via scirs2-io
//! - **Benchmarks**: Comprehensive comparison with PyTorch DataLoader
//!
//! # Examples
//!
//! ## Loading toy datasets
//!
//! ```rust
//! use scirs2_datasets::{load_iris, load_boston};
//!
//! // Load the classic Iris dataset
//! let iris = load_iris().expect("Operation failed");
//! println!("Iris dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
//!
//! // Load the Boston housing dataset
//! let boston = load_boston().expect("Operation failed");
//! println!("Boston dataset: {} samples, {} features", boston.n_samples(), boston.n_features());
//! ```
//!
//! ## Generating synthetic datasets
//!
//! ```rust
//! use scirs2_datasets::{make_classification, make_regression, make_blobs, make_spirals, make_moons};
//!
//! // Generate a classification dataset
//! let classification = make_classification(100, 5, 3, 2, 4, Some(42)).expect("Operation failed");
//! println!("Classification dataset: {} samples, {} features, {} classes",
//!          classification.n_samples(), classification.n_features(), 3);
//!
//! // Generate a regression dataset
//! let regression = make_regression(50, 4, 3, 0.1, Some(42)).expect("Operation failed");
//! println!("Regression dataset: {} samples, {} features",
//!          regression.n_samples(), regression.n_features());
//!
//! // Generate a clustering dataset
//! let blobs = make_blobs(80, 3, 4, 1.0, Some(42)).expect("Operation failed");
//! println!("Blobs dataset: {} samples, {} features, {} clusters",
//!          blobs.n_samples(), blobs.n_features(), 4);
//!
//! // Generate non-linear patterns
//! let spirals = make_spirals(200, 2, 0.1, Some(42)).expect("Operation failed");
//! let moons = make_moons(150, 0.05, Some(42)).expect("Operation failed");
//! ```
//!
//! ## Cross-validation
//!
//! ```rust
//! use scirs2_datasets::{load_iris, k_fold_split, stratified_k_fold_split};
//!
//! let iris = load_iris().expect("Operation failed");
//!
//! // K-fold cross-validation
//! let k_folds = k_fold_split(iris.n_samples(), 5, true, Some(42)).expect("Operation failed");
//! println!("Created {} folds for K-fold CV", k_folds.len());
//!
//! // Stratified K-fold cross-validation
//! if let Some(target) = &iris.target {
//!     let stratified_folds = stratified_k_fold_split(target, 5, true, Some(42)).expect("Operation failed");
//!     println!("Created {} stratified folds", stratified_folds.len());
//! }
//! ```
//!
//! ## Dataset manipulation
//!
//! ```rust
//! use scirs2_datasets::{load_iris, Dataset};
//!
//! let iris = load_iris().expect("Operation failed");
//!
//! // Access dataset properties
//! println!("Dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
//! if let Some(featurenames) = iris.featurenames() {
//!     println!("Features: {:?}", featurenames);
//! }
//! ```

128#![warn(missing_docs)]
129
130pub mod advanced_generators;
131pub mod benchmarks;
132pub mod cache;
133pub mod cloud;
134pub mod distributed;
135pub mod domain_specific;
136pub mod error;
137pub mod explore;
138pub mod external;
139pub mod generators;
140pub mod gpu;
141pub mod gpu_optimization;
142pub mod loaders;
143pub mod ml_integration;
144pub mod real_world;
145pub mod registry;
146pub mod sample;
147pub mod streaming;
148pub mod time_series;
149pub mod toy;
150/// Core utilities for working with datasets
151///
152/// This module provides the Dataset struct and helper functions for
153/// manipulating and transforming datasets.
154pub mod utils;
155
156/// Standard benchmark datasets (fully embedded, no download required)
157///
158/// Provides well-known ML datasets: Iris, Wine, Breast Cancer, Digits, Boston.
159/// Each returns a `DatasetResult` with data, target, feature names, and description.
160pub mod standard;
161
162/// API stability guarantees and compatibility documentation
163///
164/// This module defines the API stability levels and compatibility guarantees
165/// for the scirs2-datasets crate.
166pub mod stability;
167
168/// Pure Rust platform directory detection (replaces `dirs` crate for COOLJAPAN Pure Rust policy)
169pub mod platform_dirs;
170
171// Temporary module to test method resolution conflict
172mod method_resolution_test;
173
174pub mod adaptive_streaming_engine;
175pub mod neuromorphic_data_processor;
176pub mod quantum_enhanced_generators;
177pub mod quantum_neuromorphic_fusion;
178
179// v0.2.0 modules
180/// Lazy loading and memory-mapped datasets
181///
182/// Provides zero-copy dataset access with adaptive chunking for memory-efficient
183/// processing of datasets larger than available RAM.
184#[cfg(feature = "lazy-loading")]
185pub mod lazy_loading;
186
187/// Data augmentation pipeline with GPU support
188///
189/// Composable augmentation transforms for images, audio, and tabular data
190/// with optional GPU acceleration for improved performance.
191#[cfg(feature = "augmentation")]
192pub mod augmentation;
193
194/// Parallel data preprocessing
195///
196/// Multi-threaded preprocessing pipeline with work-stealing scheduler and
197/// backpressure handling for optimal throughput.
198pub mod parallel_preprocessing;
199
200/// Distributed dataset loading
201///
202/// Shard-aware loading for distributed training with multi-node coordination
203/// and distributed caching.
204#[cfg(feature = "distributed")]
205pub mod distributed_loading;
206
207/// Format support (Parquet, Arrow, HDF5)
208///
209/// Integration with scirs2-io for reading and writing datasets in modern
210/// columnar and scientific formats.
211pub mod formats;
212
213// Benchmarks module (named to avoid conflict with benchmarks)
214pub mod benchmarks_module;
215// HuggingFace Hub metadata integration
216pub mod hub_metadata;
217// Dataset sharding API
218pub mod sharding;
219// Mini-batch sampling
220pub mod sampling;
221// Streaming CSV loader
222pub mod streaming_csv;
223
224// Re-export commonly used functionality
225pub use adaptive_streaming_engine::{
226    create_adaptive_engine, create_adaptive_engine_with_config, AdaptiveStreamConfig,
227    AdaptiveStreamingEngine, AlertSeverity, AlertType, ChunkMetadata, DataCharacteristics,
228    MemoryStrategy, PatternType, PerformanceMetrics, QualityAlert, QualityMetrics,
229    StatisticalMoments, StreamChunk, TrendDirection, TrendIndicators,
230};
231pub use advanced_generators::{
232    make_adversarial_examples, make_anomaly_dataset, make_continual_learning_dataset,
233    make_domain_adaptation_dataset, make_few_shot_dataset, make_multitask_dataset,
234    AdversarialConfig, AnomalyConfig, AnomalyType, AttackMethod, ContinualLearningDataset,
235    DomainAdaptationConfig, DomainAdaptationDataset, FewShotDataset, MultiTaskConfig,
236    MultiTaskDataset, TaskType,
237};
238pub use benchmarks::{BenchmarkResult, BenchmarkRunner, BenchmarkSuite, PerformanceComparison};
239pub use cloud::{
240    presets::{azure_client, gcs_client, public_s3_client, s3_client, s3_compatible_client},
241    public_datasets::{AWSOpenData, AzureOpenData, GCPPublicData},
242    CloudClient, CloudConfig, CloudCredentials, CloudProvider,
243};
244pub use distributed::{DistributedConfig, DistributedProcessor, ScalingMethod, ScalingParameters};
245pub use domain_specific::{
246    astronomy::StellarDatasets,
247    climate::ClimateDatasets,
248    convenience::{
249        list_domain_datasets, load_atmospheric_chemistry, load_climate_data, load_exoplanets,
250        load_gene_expression, load_stellar_classification,
251    },
252    genomics::GenomicsDatasets,
253    DomainConfig, QualityFilters,
254};
255pub use explore::{
256    convenience::{explore, export_summary, info, quick_summary},
257    DatasetExplorer, DatasetSummary, ExploreConfig, FeatureStatistics, InferredDataType,
258    OutputFormat, QualityAssessment,
259};
260#[cfg(not(feature = "download"))]
261pub use external::convenience::{load_github_dataset_sync, load_uci_dataset_sync};
262pub use external::{
263    convenience::{list_uci_datasets, load_from_url_sync},
264    repositories::{GitHubRepository, KaggleRepository, UCIRepository},
265    ExternalClient, ExternalConfig, ProgressCallback,
266};
267pub use ml_integration::{
268    convenience::{create_experiment, cv_split, prepare_for_ml, train_test_split},
269    CrossValidationResults, DataSplit, MLExperiment, MLPipeline, MLPipelineConfig,
270    ScalingMethod as MLScalingMethod,
271};
272
273pub use cache::{
274    get_cachedir, BatchOperations, BatchResult, CacheFileInfo, CacheManager, CacheStats,
275    DatasetCache, DetailedCacheStats,
276};
277#[cfg(feature = "download")]
278pub use external::convenience::{load_from_url, load_github_dataset, load_uci_dataset};
279pub use generators::{
280    add_time_series_noise, benchmark_gpu_vs_cpu, get_gpu_info, gpu_is_available,
281    inject_missing_data, inject_outliers, make_anisotropic_blobs, make_blobs, make_blobs_gpu,
282    make_circles, make_classification, make_classification_gpu, make_corrupted_dataset, make_helix,
283    make_hierarchical_clusters, make_intersecting_manifolds, make_manifold, make_moons,
284    make_regression, make_regression_gpu, make_s_curve, make_severed_sphere, make_spirals,
285    make_swiss_roll, make_swiss_roll_advanced, make_time_series, make_torus, make_twin_peaks,
286    ManifoldConfig, ManifoldType, MissingPattern, OutlierType,
287};
288// Time series generators
289pub use generators::time_series::{
290    make_ar_process, make_random_walk, make_seasonal, make_sine_wave,
291};
292// Graph generators
293pub use generators::graph::{
294    make_barabasi_albert, make_karate_club, make_random_graph, make_watts_strogatz,
295};
296// Sparse matrix generators
297pub use generators::sparse::{make_sparse_banded, make_sparse_laplacian, make_sparse_spd};
298// Classification generators
299pub use generators::classification::{
300    make_classification_enhanced, make_hastie_10_2, make_multilabel_classification,
301    ClassificationConfig, MultilabelConfig, MultilabelDataset,
302};
303// Regression generators
304pub use generators::regression::{
305    make_friedman1, make_friedman2, make_friedman3, make_low_rank_matrix, make_sparse_uncorrelated,
306};
307// Structured generators
308pub use generators::structured::{
309    make_biclusters, make_checkerboard, make_sparse_coded_signal, make_sparse_spd_matrix,
310    make_spd_matrix,
311};
312// Advanced generators: low-rank, sparse classification, multilabel, heterogeneous, concept drift
313pub use generators::concept_drift::{
314    detect_drift_accuracy, make_concept_drift, ConceptDriftConfig, ConceptDriftDataset, DriftType,
315};
316pub use generators::heterogeneous::{
317    encode_one_hot, make_heterogeneous, FeatureType, HeteroConfig, HeteroDataset,
318    HeteroFeatureValue,
319};
320pub use generators::low_rank::{
321    make_low_rank as make_low_rank_completion, observed_rmse, reconstruction_error, LowRankConfig,
322    LowRankDataset,
323};
324pub use generators::multilabel_advanced::{
325    hamming_loss, label_cardinality, label_density_score, make_advanced_multilabel_classification,
326    AdvancedMultilabelConfig, AdvancedMultilabelDataset,
327};
328pub use generators::sparse_classification::{
329    make_sparse_classification as make_sparse_class, sparsity_ratio, SparseClassConfig,
330    SparseClassDataset,
331};
332// Sharding (data-carrying)
333pub use sharding::{merge_shards, shard_dataset, shuffled_shard, stratified_shard, DatasetShard};
334// Mini-batch sampling
335pub use sampling::{iter_batches, MiniBatch, MiniBatchSampler, SamplerConfig, SamplerStrategy};
336// Standard datasets
337pub use gpu::{
338    get_optimal_gpu_config, is_cuda_available, is_opencl_available, list_gpu_devices,
339    make_blobs_auto_gpu, make_classification_auto_gpu, make_regression_auto_gpu, GpuBackend,
340    GpuBenchmark, GpuBenchmarkResults, GpuConfig, GpuContext, GpuDeviceInfo, GpuMemoryConfig,
341};
342pub use gpu_optimization::{
343    benchmark_advanced_performance, generate_advanced_matrix, AdvancedGpuOptimizer,
344    AdvancedKernelConfig, BenchmarkResult as AdvancedBenchmarkResult, DataLayout,
345    LoadBalancingMethod, MemoryAccessPattern, PerformanceBenchmarkResults, SpecializationLevel,
346    VectorizationStrategy,
347};
348pub use loaders::{
349    load_csv, load_csv_legacy, load_csv_parallel, load_csv_streaming, load_json, load_raw,
350    save_json, CsvConfig, DatasetChunkIterator, StreamingConfig,
351};
352pub use neuromorphic_data_processor::{
353    create_neuromorphic_processor, create_neuromorphic_processor_with_topology, NetworkTopology,
354    NeuromorphicProcessor, NeuromorphicTransform, SynapticPlasticity,
355};
356pub use quantum_enhanced_generators::{
357    make_quantum_blobs, make_quantum_classification, make_quantum_regression,
358    QuantumDatasetGenerator,
359};
360pub use quantum_neuromorphic_fusion::{
361    create_fusion_with_params, create_quantum_neuromorphic_fusion, QuantumBioFusionResult,
362    QuantumInterference, QuantumNeuromorphicFusion,
363};
364pub use real_world::{
365    list_real_world_datasets, load_adult, load_california_housing, load_heart_disease,
366    load_red_wine_quality, load_titanic, RealWorldConfig, RealWorldDatasets,
367};
368pub use registry::{get_registry, load_dataset_byname, DatasetMetadata, DatasetRegistry};
369pub use sample::*;
370pub use standard::{
371    load_boston as load_boston_full, load_breast_cancer as load_breast_cancer_full,
372    load_digits as load_digits_full, load_iris as load_iris_full, load_wine, DatasetResult,
373};
374pub use streaming::{
375    stream_classification, stream_csv, stream_regression, DataChunk, StreamConfig, StreamProcessor,
376    StreamStats, StreamTransformer, StreamingIterator,
377};
378pub use toy::*;
379pub use utils::{
380    analyze_dataset_advanced, create_balanced_dataset, create_binned_features,
381    generate_synthetic_samples, importance_sample, k_fold_split, min_max_scale,
382    polynomial_features, quick_quality_assessment, random_oversample, random_sample,
383    random_undersample, robust_scale, statistical_features, stratified_k_fold_split,
384    stratified_sample, time_series_split, AdvancedDatasetAnalyzer, AdvancedQualityMetrics,
385    BalancingStrategy, BinningStrategy, CorrelationInsights, CrossValidationFolds, Dataset,
386    NormalityAssessment,
387};
388
389// v0.2.0 re-exports
390#[cfg(feature = "lazy-loading")]
391pub use lazy_loading::{
392    from_binary as lazy_from_binary, from_binary_with_config as lazy_from_binary_with_config,
393    LazyChunkIterator, LazyDataset, LazyLoadConfig, MmapDataset,
394};
395
396#[cfg(feature = "augmentation")]
397pub use augmentation::{
398    standard_image_augmentation, standard_tabular_augmentation, AugmentationPipeline, Brightness,
399    Contrast, GaussianNoise, HorizontalFlip, Mixup, RandomFeatureScale, RandomRotation90,
400    Transform, VerticalFlip,
401};
402
403pub use parallel_preprocessing::{
404    create_pipeline, create_pipeline_with_config, ParallelConfig, ParallelPipeline, PreprocessFn,
405};
406
407#[cfg(feature = "distributed")]
408pub use distributed_loading::{
409    create_loader, create_loader_with_config, DistributedCache,
410    DistributedConfig as DistributedLoadingConfig, DistributedLoader, Shard,
411};
412
413pub use formats::{CompressionCodec, FormatConfig, FormatType};
414
415#[cfg(feature = "formats")]
416pub use formats::{
417    read_auto, read_hdf5, read_parquet, write_hdf5, write_parquet, FormatConverter, Hdf5Reader,
418    Hdf5Writer, ParquetReader, ParquetWriter,
419};