scirs2_datasets/
lib.rs

1#![allow(deprecated)]
2//! Datasets module for SciRS2
3//!
4//! This module provides dataset loading utilities similar to scikit-learn's datasets module.
5//! It includes toy datasets, sample datasets, time series datasets, data generators,
6//! and utilities for loading and processing datasets.
7//!
8//! # Features
9//!
10//! - **Toy datasets**: Classic datasets like Iris, Boston Housing, Breast Cancer, and Digits
11//! - **Data generators**: Create synthetic datasets for classification, regression, clustering, and time series
12//! - **Cross-validation utilities**: K-fold, stratified, and time series cross-validation
13//! - **Dataset utilities**: Train/test splitting, normalization, and metadata handling
14//! - **Caching**: Efficient caching system for downloaded datasets
15//! - **Registry**: Centralized registry for dataset metadata and locations
16//!
17//! # Examples
18//!
19//! ## Loading toy datasets
20//!
21//! ```rust
22//! use scirs2_datasets::{load_iris, load_boston};
23//!
24//! // Load the classic Iris dataset
25//! let iris = load_iris().unwrap();
26//! println!("Iris dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
27//!
28//! // Load the Boston housing dataset
29//! let boston = load_boston().unwrap();
30//! println!("Boston dataset: {} samples, {} features", boston.n_samples(), boston.n_features());
31//! ```
32//!
33//! ## Generating synthetic datasets
34//!
35//! ```rust
36//! use scirs2_datasets::{make_classification, make_regression, make_blobs, make_spirals, make_moons};
37//!
38//! // Generate a classification dataset
39//! let classification = make_classification(100, 5, 3, 2, 4, Some(42)).unwrap();
40//! println!("Classification dataset: {} samples, {} features, {} classes",
41//!          classification.n_samples(), classification.n_features(), 3);
42//!
43//! // Generate a regression dataset
44//! let regression = make_regression(50, 4, 3, 0.1, Some(42)).unwrap();
45//! println!("Regression dataset: {} samples, {} features",
46//!          regression.n_samples(), regression.n_features());
47//!
48//! // Generate a clustering dataset
49//! let blobs = make_blobs(80, 3, 4, 1.0, Some(42)).unwrap();
50//! println!("Blobs dataset: {} samples, {} features, {} clusters",
51//!          blobs.n_samples(), blobs.n_features(), 4);
52//!
53//! // Generate non-linear patterns
54//! let spirals = make_spirals(200, 2, 0.1, Some(42)).unwrap();
55//! let moons = make_moons(150, 0.05, Some(42)).unwrap();
56//! ```
57//!
58//! ## Cross-validation
59//!
60//! ```rust
61//! use scirs2_datasets::{load_iris, k_fold_split, stratified_k_fold_split};
62//!
63//! let iris = load_iris().unwrap();
64//!
65//! // K-fold cross-validation
66//! let k_folds = k_fold_split(iris.n_samples(), 5, true, Some(42)).unwrap();
67//! println!("Created {} folds for K-fold CV", k_folds.len());
68//!
69//! // Stratified K-fold cross-validation
70//! if let Some(target) = &iris.target {
71//!     let stratified_folds = stratified_k_fold_split(target, 5, true, Some(42)).unwrap();
72//!     println!("Created {} stratified folds", stratified_folds.len());
73//! }
74//! ```
75//!
76//! ## Dataset manipulation
77//!
78//! ```rust
79//! use scirs2_datasets::{load_iris, Dataset};
80//!
81//! let iris = load_iris().unwrap();
82//!
83//! // Access dataset properties
84//! println!("Dataset: {} samples, {} features", iris.n_samples(), iris.n_features());
85//! if let Some(featurenames) = iris.featurenames() {
86//!     println!("Features: {:?}", featurenames);
87//! }
88//! ```
89
90#![warn(missing_docs)]
91
92pub mod advanced_generators;
93pub mod benchmarks;
94pub mod cache;
95pub mod cloud;
96pub mod distributed;
97pub mod domain_specific;
98pub mod error;
99pub mod explore;
100pub mod external;
101pub mod generators;
102pub mod gpu;
103pub mod gpu_optimization;
104pub mod loaders;
105pub mod ml_integration;
106pub mod real_world;
107pub mod registry;
108pub mod sample;
109pub mod streaming;
110pub mod time_series;
111pub mod toy;
112/// Core utilities for working with datasets
113///
114/// This module provides the Dataset struct and helper functions for
115/// manipulating and transforming datasets.
116pub mod utils;
117
118/// API stability guarantees and compatibility documentation
119///
120/// This module defines the API stability levels and compatibility guarantees
121/// for the scirs2-datasets crate.
122pub mod stability;
123
124// Temporary module to test method resolution conflict
125mod method_resolution_test;
126
127pub mod adaptive_streaming_engine;
128pub mod neuromorphic_data_processor;
129pub mod quantum_enhanced_generators;
130pub mod quantum_neuromorphic_fusion;
131
132// Re-export commonly used functionality
133pub use adaptive_streaming_engine::{
134    create_adaptive_engine, create_adaptive_engine_with_config, AdaptiveStreamConfig,
135    AdaptiveStreamingEngine, AlertSeverity, AlertType, ChunkMetadata, DataCharacteristics,
136    MemoryStrategy, PatternType, PerformanceMetrics, QualityAlert, QualityMetrics,
137    StatisticalMoments, StreamChunk, TrendDirection, TrendIndicators,
138};
139pub use advanced_generators::{
140    make_adversarial_examples, make_anomaly_dataset, make_continual_learning_dataset,
141    make_domain_adaptation_dataset, make_few_shot_dataset, make_multitask_dataset,
142    AdversarialConfig, AnomalyConfig, AnomalyType, AttackMethod, ContinualLearningDataset,
143    DomainAdaptationConfig, DomainAdaptationDataset, FewShotDataset, MultiTaskConfig,
144    MultiTaskDataset, TaskType,
145};
146pub use benchmarks::{BenchmarkResult, BenchmarkRunner, BenchmarkSuite, PerformanceComparison};
147pub use cloud::{
148    presets::{azure_client, gcs_client, public_s3_client, s3_client, s3_compatible_client},
149    public_datasets::{AWSOpenData, AzureOpenData, GCPPublicData},
150    CloudClient, CloudConfig, CloudCredentials, CloudProvider,
151};
152pub use distributed::{DistributedConfig, DistributedProcessor, ScalingMethod, ScalingParameters};
153pub use domain_specific::{
154    astronomy::StellarDatasets,
155    climate::ClimateDatasets,
156    convenience::{
157        list_domain_datasets, load_atmospheric_chemistry, load_climate_data, load_exoplanets,
158        load_gene_expression, load_stellar_classification,
159    },
160    genomics::GenomicsDatasets,
161    DomainConfig, QualityFilters,
162};
163pub use explore::{
164    convenience::{explore, export_summary, info, quick_summary},
165    DatasetExplorer, DatasetSummary, ExploreConfig, FeatureStatistics, InferredDataType,
166    OutputFormat, QualityAssessment,
167};
168#[cfg(not(feature = "download"))]
169pub use external::convenience::{load_github_dataset_sync, load_uci_dataset_sync};
170pub use external::{
171    convenience::{list_uci_datasets, load_from_url_sync},
172    repositories::{GitHubRepository, KaggleRepository, UCIRepository},
173    ExternalClient, ExternalConfig, ProgressCallback,
174};
175pub use ml_integration::{
176    convenience::{create_experiment, cv_split, prepare_for_ml, train_test_split},
177    CrossValidationResults, DataSplit, MLExperiment, MLPipeline, MLPipelineConfig,
178    ScalingMethod as MLScalingMethod,
179};
180
181pub use cache::{
182    get_cachedir, BatchOperations, BatchResult, CacheFileInfo, CacheManager, CacheStats,
183    DatasetCache, DetailedCacheStats,
184};
185#[cfg(feature = "download")]
186pub use external::convenience::{load_from_url, load_github_dataset, load_uci_dataset};
187pub use generators::{
188    add_time_series_noise, benchmark_gpu_vs_cpu, get_gpu_info, gpu_is_available,
189    inject_missing_data, inject_outliers, make_anisotropic_blobs, make_blobs, make_blobs_gpu,
190    make_circles, make_classification, make_classification_gpu, make_corrupted_dataset, make_helix,
191    make_hierarchical_clusters, make_intersecting_manifolds, make_manifold, make_moons,
192    make_regression, make_regression_gpu, make_s_curve, make_severed_sphere, make_spirals,
193    make_swiss_roll, make_swiss_roll_advanced, make_time_series, make_torus, make_twin_peaks,
194    ManifoldConfig, ManifoldType, MissingPattern, OutlierType,
195};
196pub use gpu::{
197    get_optimal_gpu_config, is_cuda_available, is_opencl_available, list_gpu_devices,
198    make_blobs_auto_gpu, make_classification_auto_gpu, make_regression_auto_gpu, GpuBackend,
199    GpuBenchmark, GpuBenchmarkResults, GpuConfig, GpuContext, GpuDeviceInfo, GpuMemoryConfig,
200};
201pub use gpu_optimization::{
202    benchmark_advanced_performance, generate_advanced_matrix, AdvancedGpuOptimizer,
203    AdvancedKernelConfig, BenchmarkResult as AdvancedBenchmarkResult, DataLayout,
204    LoadBalancingMethod, MemoryAccessPattern, PerformanceBenchmarkResults, SpecializationLevel,
205    VectorizationStrategy,
206};
207pub use loaders::{
208    load_csv, load_csv_legacy, load_csv_parallel, load_csv_streaming, load_json, load_raw,
209    save_json, CsvConfig, DatasetChunkIterator, StreamingConfig,
210};
211pub use neuromorphic_data_processor::{
212    create_neuromorphic_processor, create_neuromorphic_processor_with_topology, NetworkTopology,
213    NeuromorphicProcessor, NeuromorphicTransform, SynapticPlasticity,
214};
215pub use quantum_enhanced_generators::{
216    make_quantum_blobs, make_quantum_classification, make_quantum_regression,
217    QuantumDatasetGenerator,
218};
219pub use quantum_neuromorphic_fusion::{
220    create_fusion_with_params, create_quantum_neuromorphic_fusion, QuantumBioFusionResult,
221    QuantumInterference, QuantumNeuromorphicFusion,
222};
223pub use real_world::{
224    list_real_world_datasets, load_adult, load_california_housing, load_heart_disease,
225    load_red_wine_quality, load_titanic, RealWorldConfig, RealWorldDatasets,
226};
227pub use registry::{get_registry, load_dataset_byname, DatasetMetadata, DatasetRegistry};
228pub use sample::*;
229pub use streaming::{
230    stream_classification, stream_csv, stream_regression, DataChunk, StreamConfig, StreamProcessor,
231    StreamStats, StreamTransformer, StreamingIterator,
232};
233pub use toy::*;
234pub use utils::{
235    analyze_dataset_advanced, create_balanced_dataset, create_binned_features,
236    generate_synthetic_samples, importance_sample, k_fold_split, min_max_scale,
237    polynomial_features, quick_quality_assessment, random_oversample, random_sample,
238    random_undersample, robust_scale, statistical_features, stratified_k_fold_split,
239    stratified_sample, time_series_split, AdvancedDatasetAnalyzer, AdvancedQualityMetrics,
240    BalancingStrategy, BinningStrategy, CorrelationInsights, CrossValidationFolds, Dataset,
241    NormalityAssessment,
242};