// tenflowers_dataset/lib.rs

1//! # TenfloweRS Dataset
2//!
3//! Efficient data loading, preprocessing, and augmentation for machine learning in TenfloweRS.
4//! This crate provides high-performance data pipelines with support for various formats, transformations,
5//! and distributed loading strategies.
6//!
7//! ## Features
8//!
9//! - **Multiple Data Formats**: CSV, image folders, HDF5, Arrow/Parquet, JSON, and custom formats
10//! - **Efficient Data Loading**: Multi-threaded prefetching, NUMA-aware scheduling, zero-copy operations
11//! - **Rich Transformations**: SIMD-accelerated transforms, GPU preprocessing, composition
12//! - **Advanced Sampling**: Stratified, importance, and distributed sampling strategies
13//! - **Data Quality**: Built-in quality analysis, outlier detection, and drift monitoring
14//! - **Production Features**: Checkpointing, versioning, reproducibility, and debugging tools
15//! - **Streaming Support**: Large dataset streaming with predictive prefetching
16//!
17//! ## Quick Start
18//!
19//! ### Loading Data from CSV
20//!
21//! ```rust,ignore
22//! use tenflowers_dataset::{CsvDataset, CsvDatasetBuilder};
23//!
24//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
25//! // Load CSV data
26//! let dataset = CsvDatasetBuilder::new("data.csv")
27//!     .has_header(true)
28//!     .delimiter(b',')
29//!     .build()?;
30//!
31//! println!("Dataset has {} samples", dataset.len());
32//! # Ok(())
33//! # }
34//! ```
35//!
36//! ### Image Folder Dataset
37//!
38//! ```rust,ignore
39//! use tenflowers_dataset::{ImageFolderDataset, ImageFolderDatasetBuilder};
40//!
41//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
42//! // Load images from directory structure:
43//! // train/
44//! //   cat/
45//! //     img1.jpg
46//! //     img2.jpg
47//! //   dog/
48//! //     img3.jpg
49//! let dataset = ImageFolderDatasetBuilder::new("train/")
50//!     .image_size((224, 224))
51//!     .build()?;
52//!
53//! println!("Found {} images in {} classes", dataset.len(), dataset.num_classes());
54//! # Ok(())
55//! # }
56//! ```
57//!
58//! ### Data Loader with Batching
59//!
60//! ```rust,ignore
61//! use tenflowers_dataset::{DataLoader, DataLoaderBuilder};
62//! use tenflowers_dataset::{CsvDataset, RandomSampler};
63//!
64//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
65//! # let dataset = CsvDataset::new("data.csv")?;
66//! // Create a data loader
67//! let loader = DataLoaderBuilder::new(dataset)
68//!     .batch_size(32)
69//!     .shuffle(true)
70//!     .num_workers(4)
71//!     .prefetch(2)
72//!     .build()?;
73//!
74//! // Iterate through batches
75//! for batch in loader.iter() {
76//!     let (features, labels) = batch?;
77//!     // Training step...
78//! }
79//! # Ok(())
80//! # }
81//! ```
82//!
83//! ### Data Transformations
84//!
85//! ```rust,ignore
86//! use tenflowers_dataset::transforms::{Compose, Normalize, RandomCrop, ToTensor};
87//!
88//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
89//! // Compose multiple transformations
90//! let transform = Compose::new(vec![
91//!     Box::new(RandomCrop::new((224, 224))),
92//!     Box::new(ToTensor),
93//!     Box::new(Normalize::new(vec![0.485, 0.456, 0.406], vec![0.229, 0.224, 0.225])),
94//! ]);
95//! # Ok(())
96//! # }
97//! ```
98//!
99//! ## Advanced Features
100//!
101//! ### Distributed Data Loading
102//!
103//! ```rust,ignore
104//! use tenflowers_dataset::{DataLoaderBuilder, DistributedSampler};
105//! use tenflowers_dataset::CsvDataset;
106//!
107//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
108//! # let dataset = CsvDataset::new("data.csv")?;
109//! // Split dataset across multiple workers
110//! let sampler = DistributedSampler::new(dataset.len(), 4, 0); // 4 workers, rank 0
111//!
112//! let loader = DataLoaderBuilder::new(dataset)
113//!     .sampler(Box::new(sampler))
114//!     .batch_size(32)
115//!     .build()?;
116//! # Ok(())
117//! # }
118//! ```
119//!
120//! ### Data Quality Analysis
121//!
122//! ```rust,ignore
123//! use tenflowers_dataset::{DataQualityAnalyzer, QualityAnalysisConfig};
124//! use tenflowers_dataset::CsvDataset;
125//!
126//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
127//! # let dataset = CsvDataset::new("data.csv")?;
128//! // Analyze data quality
129//! let analyzer = DataQualityAnalyzer::new(QualityAnalysisConfig::default());
130//! let report = analyzer.analyze(&dataset)?;
131//!
132//! println!("Data quality score: {:.2}", report.overall_score());
133//! println!("Issues found: {}", report.num_issues());
134//! # Ok(())
135//! # }
136//! ```
137//!
138//! ### Caching and Prefetching
139//!
140//! ```rust,ignore
141//! use tenflowers_dataset::{EnhancedDataLoaderBuilder, CsvDataset};
142//!
143//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
144//! # let dataset = CsvDataset::new("data.csv")?;
145//! // Use enhanced data loader with smart caching
146//! let loader = EnhancedDataLoaderBuilder::new(dataset)
147//!     .batch_size(64)
148//!     .num_workers(8)
149//!     .enable_caching(true)
150//!     .cache_size_mb(512)
151//!     .adaptive_prefetch(true)
152//!     .build()?;
153//! # Ok(())
154//! # }
155//! ```
156//!
157//! ### Custom Dataset
158//!
159//! ```rust,no_run
160//! use tenflowers_core::{Tensor, Result};
161//! use std::marker::PhantomData;
162//!
163//! struct MyDataset<T> {
164//!     data: Vec<Vec<T>>,
165//!     _phantom: PhantomData<T>,
166//! }
167//!
168//! impl<T: Clone> MyDataset<T> {
169//!     fn len(&self) -> usize {
170//!         self.data.len()
171//!     }
172//!
173//!     fn get(&self, index: usize) -> Option<&Vec<T>> {
174//!         self.data.get(index)
175//!     }
176//! }
177//! ```
178//!
179//! ## Architecture Overview
180//!
181//! The crate is organized into the following modules:
182//!
183//! - [`formats`]: Data format readers (CSV, image, HDF5, Arrow, Parquet)
184//! - [`dataloader`]: Multi-threaded data loading with batching and sampling
185//! - [`transforms`]: Data transformation and augmentation operations
186//! - [`cache`]: Caching strategies for frequently accessed data
187//! - [`distributed_loading`]: Distributed and sharded data loading
188//! - [`data_quality`]: Data quality analysis and validation
189//! - [`statistics`]: Dataset statistics computation
190//! - [`visualization`]: Dataset visualization utilities
191//! - [`reproducibility`]: Reproducibility and versioning support
192//! - [`debug_tools`]: Profiling and debugging utilities
193//!
194//! ## Performance Optimization
195//!
196//! ### SIMD Transformations
197//!
198//! Many transformations use SIMD instructions for maximum performance:
199//!
200//! ```rust,ignore
201//! use tenflowers_dataset::simd_transforms::{SimdNormalize, SimdResize};
202//!
203//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
204//! // SIMD-accelerated normalization
205//! let normalize = SimdNormalize::new(vec![0.5, 0.5, 0.5], vec![0.5, 0.5, 0.5]);
206//! # Ok(())
207//! # }
208//! ```
209//!
210//! ### GPU Preprocessing
211//!
212//! ```rust,ignore
213//! use tenflowers_dataset::gpu_transforms::{GpuResize, GpuNormalize};
214//! use tenflowers_core::Device;
215//!
216//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
217//! # #[cfg(feature = "gpu")]
218//! # {
219//! // Run transformations on GPU
220//! let device = Device::gpu(0)?;
221//! let resize = GpuResize::new((224, 224), &device)?;
222//! # }
223//! # Ok(())
224//! # }
225//! ```
226//!
227//! ### Zero-Copy Operations
228//!
229//! ```rust,ignore
230//! use tenflowers_dataset::zero_copy::{ZeroCopyLoader, MmapDataset};
231//!
232//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
233//! // Memory-mapped dataset for large files
234//! let dataset = MmapDataset::new("large_dataset.bin")?;
235//! # Ok(())
236//! # }
237//! ```
238//!
239//! ## Integration with TenfloweRS Ecosystem
240//!
241//! This crate integrates seamlessly with:
242//! - `tenflowers-core`: Tensor operations and device management
243//! - `tenflowers-neural`: Neural network training pipelines
244//! - `tenflowers-autograd`: Gradient-based transformations
245//! - `scirs2-core`: Scientific computing utilities
246//!
247//! ## Supported Data Formats
248//!
249//! - **CSV**: Comma-separated values with customizable delimiters
250//! - **Images**: JPEG, PNG, BMP, TIFF via image folder structure
251//! - **HDF5**: Hierarchical data format for scientific data
252//! - **Arrow/Parquet**: Columnar data formats for analytics
253//! - **JSON**: Structured JSON data
254//! - **Custom**: Extensible format registry for custom formats
255//!
256//! ## Debugging and Profiling
257//!
258//! ```rust,ignore
259//! use tenflowers_dataset::{DatasetDebugger, PipelineProfiler};
260//! use tenflowers_dataset::CsvDataset;
261//!
262//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
263//! # let dataset = CsvDataset::new("data.csv")?;
264//! // Profile data loading pipeline
265//! let profiler = PipelineProfiler::new();
266//! profiler.start();
267//!
268//! // ... load data ...
269//!
270//! let report = profiler.generate_report();
271//! println!("Bottlenecks: {:?}", report.bottlenecks());
272//! # Ok(())
273//! # }
274//! ```
275//!
276//! ## Pipeline Inspection
277//!
278//! [`InspectablePipeline`] instruments each transform step to record per-step latency,
279//! input/output shapes, and error rates:
280//!
281//! ```rust,ignore
282//! use tenflowers_dataset::{InspectablePipeline};
283//!
284//! # fn main() {
285//! let mut pipeline = InspectablePipeline::new();
286//! // pipeline.add_step("norm", Box::new(my_transform));
287//! // let report = pipeline.run_inspection_batch(&dataset, 100);
288//! // println!("avg latency: {} μs", report.avg_latency_per_step_micros());
289//! # }
290//! ```
291//!
292//! ## Data Drift Metrics
293//!
//! Three statistical drift measures, plus a convenience wrapper, are available as free functions:
295//!
296//! - [`population_stability_index`]: PSI over equal-width bins; < 0.1 stable, > 0.2 significant.
297//! - [`ks_two_sample`]: Kolmogorov-Smirnov max |ECDF_a − ECDF_b|, range [0, 1].
298//! - [`jensen_shannon_divergence`]: Symmetric KL-based divergence, range [0, 1].
299//! - [`compute_drift`]: Convenience wrapper returning a [`DriftReport`] with all three.
300//!
301//! ```rust,no_run
302//! use tenflowers_dataset::compute_drift;
303//!
304//! let reference: Vec<f64> = (0..100).map(|i| i as f64).collect();
305//! let current: Vec<f64> = (0..100).map(|i| i as f64 + 50.0).collect();
306//! let report = compute_drift(&reference, &current).expect("drift computation failed");
307//! println!("PSI: {:.4}, KS: {:.4}, significant: {}", report.psi, report.ks_statistic, report.is_significant_drift);
308//! ```
309//!
310//! ## Adaptive Prefetch PID Controller
311//!
312//! [`PidAdaptiveController`] adjusts prefetch depth based on cache-hit rate telemetry
313//! using a classic PID algorithm with anti-windup integral clamping:
314//!
315//! ```rust,no_run
316//! use tenflowers_dataset::PidAdaptiveController;
317//!
318//! let mut ctrl = PidAdaptiveController::new(0.5, 0.05, 0.01, 0.80, 4, 1, 32);
319//! let new_depth = ctrl.tick(0.65); // below setpoint → depth increases
320//! println!("Recommended prefetch depth: {}", new_depth);
321//! ```
322//!
323//! ## Schema Validation
324//!
325//! [`SchemaValidator::validate_full`] returns a [`SchemaValidationReport`] with structured
326//! [`FieldDiff`] entries for every field:
327//!
328//! ```rust,ignore
329//! use tenflowers_dataset::{SchemaValidator, FieldDiff};
330//!
331//! # fn main() {
332//! let validator = SchemaValidator::lenient(); // widening allowed
333//! // let report = validator.validate_full(&actual_metadata, &expected_fields);
334//! // for (name, diff) in &report.diffs { println!("{}: {:?}", name, diff); }
335//! # }
336//! ```
337//!
338//! ## Quick Start (Runnable Doctest)
339//!
340//! The following example builds a tiny in-memory dataset and verifies all samples
341//! can be retrieved — no file I/O or external dependencies required:
342//!
343//! ```rust
344//! use tenflowers_dataset::{Dataset, TensorDataset};
345//! use tenflowers_core::Tensor;
346//!
347//! let features = Tensor::<f32>::from_vec(
348//!     vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
349//!     &[4, 2],
350//! ).expect("tensor creation failed");
351//! let labels = Tensor::<f32>::from_vec(vec![0.0, 1.0, 0.0, 1.0], &[4])
352//!     .expect("tensor creation failed");
353//!
354//! let dataset = TensorDataset::new(features, labels);
355//! assert_eq!(dataset.len(), 4);
356//!
357//! for i in 0..dataset.len() {
358//!     let (feat, lbl) = dataset.get(i).expect("get should succeed");
359//!     assert_eq!(feat.shape().dims()[0], 2);
360//!     assert_eq!(lbl.shape().size(), 1);
361//! }
362//! ```
363
364#![warn(unsafe_code)]
365#![allow(unexpected_cfgs)]
366#![allow(clippy::result_large_err)]
367#![allow(clippy::too_many_arguments)]
368#![allow(clippy::let_and_return)]
369#![allow(clippy::should_implement_trait)]
370#![allow(clippy::erasing_op)]
371#![allow(clippy::identity_op)]
372#![allow(unused_imports)]
373#![allow(unused_variables)]
374#![allow(unused_mut)]
375#![allow(dead_code)]
376#![allow(clippy::clone_on_copy)]
377#![allow(clippy::multiple_bound_locations)]
378#![allow(clippy::iter_cloned_collect)]
379#![allow(clippy::collapsible_else_if)]
380#![allow(clippy::type_complexity)]
381#![allow(clippy::borrowed_box)]
382#![allow(clippy::derivable_impls)]
383
384// Re-export core types needed by sub-modules via `crate::Result` / `crate::TensorError`
385pub use tenflowers_core::{Result, TensorError};
386
387pub mod active_learning;
388pub mod adaptive_prefetch;
389pub mod advanced_benchmarks;
390pub mod advanced_sampling;
391pub mod attention_optimized;
392pub mod benchmarks;
393pub mod cache;
394pub mod config;
395pub mod data_quality;
396pub mod dataloader;
397pub mod dataset_core;
398pub mod debug_tools;
399pub mod distributed_loading;
400pub mod distributed_sharding;
401pub mod distributed_streaming;
402pub mod enhanced_dataloader;
403pub mod error_taxonomy;
404pub mod federated;
405pub mod formats;
406pub mod gpu_transforms;
407pub mod memory_pool;
408pub mod multimodal;
409pub mod numa_scheduler;
410pub mod online_learning;
411pub mod predictive_prefetch;
412#[cfg(feature = "download")]
413pub mod real_datasets;
414pub mod reproducibility;
415pub mod schema_inference;
416pub mod simd_transforms;
417pub mod smart_cache;
418pub mod statistics;
419pub mod stream_prefetch_optimizer;
420pub mod streaming_optimized;
421pub mod synthetic;
422pub mod throughput_benchmark;
423pub mod transforms;
424pub mod validation;
425pub mod versioning;
426pub mod visualization;
427pub mod work_stealing;
428pub mod zero_copy;
429
430pub use data_quality::{
431    compute_drift, jensen_shannon_divergence, ks_two_sample, population_stability_index,
432    DataQualityAnalyzer, DataQualityExt, DataQualityIssue, DataQualityMetrics,
433    DriftDetectionConfig, DriftDetectionResult, DriftReport, DriftType, IssueCategory,
434    IssueSeverity, OutlierDetectionMethod, QualityAnalysisConfig, StatisticalTest,
435};
436pub use dataloader::{
437    BatchResult, BucketCollate, CollateFn, DataLoader, DataLoaderBuilder, DataLoaderConfig,
438    DefaultCollate, DistributedSampler, ImportanceSampler, PaddingCollate, PaddingStrategy,
439    RandomSampler, Sampler, SequentialSampler, StratifiedSampler,
440};
441pub use debug_tools::{
442    Bottleneck, BottleneckCategory, ConsistencyReport, DatasetDebugger, EventType,
443    InspectablePipeline, InspectionEvent, PipelineInspectionReport, PipelineProfiler, ProfileEvent,
444    ProfileReport, ProfilerConfig, SampleInfo as DebugSampleInfo, Severity, StageStatistics,
445    StageTimer,
446};
447pub use enhanced_dataloader::{
448    EnhancedDataLoader, EnhancedDataLoaderBuilder, LoaderStats, WorkerStats,
449};
450pub use error_taxonomy::{
451    classification, helpers as error_helpers, DatasetErrorBuilder, DatasetErrorCategory,
452    DatasetErrorContext,
453};
454pub use formats::common::{MissingValueStrategy, NamingPattern};
455pub use formats::csv::{ChunkedCsvDataset, CsvChunk, CsvDataset, CsvDatasetBuilder};
456pub use formats::image::{
457    image_folder_dataset_with_transform, ImageFolderConfig, ImageFolderDataset,
458    ImageFolderDatasetBuilder,
459};
460pub use formats::registry::{
461    global as format_registry, register_format_factory, FormatInfo, GlobalFormatRegistry,
462};
463pub use formats::schema_validator::{
464    FieldDiff, SchemaValidator, ValidationPolicy, ValidationReport as SchemaValidationReport,
465};
466pub use transforms::{
467    AddNoise, BackgroundNoise, DatasetExt, GaussianNoise, GlobalNormalize, MinMaxScale, NoiseType,
468    Normalize, PerChannelNormalize, RealTimeAudioAugmentation, RobustScaler, Transform,
469    TransformedDataset,
470};
471// Additional format modules:
472#[cfg(feature = "serialize")]
473pub use formats::json::{
474    JsonConfig, JsonDataset, JsonDatasetBuilder, JsonDatasetInfo, JsonLDataset,
475};
476// #[cfg(feature = "mmap")]
477// pub use formats::mmap::{MmapDataset, MmapMemoryInfo};
478pub use formats::text::{
479    LabelStrategy, TextConfig, TextDataset, TextDatasetBuilder, TextDatasetInfo,
480    TokenizationStrategy, TokenizedDataset, Vocabulary,
481};
482// #[cfg(feature = "parquet")]
483// pub use formats::streaming::StreamingCheckpoint;
484pub use active_learning::{
485    ActiveLearningDataset, ActiveLearningSampler, DiversityStrategy, LabeledSubset,
486    UncertaintyStrategy, UnlabeledSubset,
487};
488pub use adaptive_prefetch::{
489    AdaptationStrategy, AdaptivePrefetchPolicy, AdaptivePrefetchTuner, PidAdaptiveController,
490    PrefetchMetrics as AdaptivePrefetchMetrics, TuningDecision,
491};
492pub use advanced_benchmarks::{
493    AdvancedBenchmarkSuite, BenchmarkConfig, BenchmarkResult, CpuStats, GpuStats, MemoryStats,
494    MemoryTracker as BenchmarkMemoryTracker, SystemInfo, ThroughputStats, TimingStats,
495};
496pub use advanced_sampling::{
497    AdvancedImportanceSampler, BalancingStrategy, ClassBalancedSampler, CurriculumScheduler,
498    CurriculumStrategy, HardNegativeMiner, MiningStrategy,
499};
500pub use attention_optimized::{
501    AttentionOptimizedConfig, AttentionOptimizedDataset, AttentionOptimizedDatasetBuilder,
502    AttentionPattern, AttentionSequence, SequenceMetadata as AttentionSequenceMetadata,
503};
504pub use benchmarks::{BenchmarkDatasets, CifarDataset, DatasetInfo, IrisDataset, MnistDataset};
505pub use cache::{
506    AggregatedStats, AlertSeverity, AlertThresholds, AlertType, CacheEvent, CacheEventType,
507    CacheExt, CacheStats, CacheTelemetryCollector, CacheTelemetryMetrics, CachedDataset,
508    EnhancedTelemetryCollector, LruCache, MetricsSnapshot, PerformanceAlert, PerformanceBaselines,
509    TelemetryConfig, ThreadSafeLruCache, WarmingStrategy,
510};
511#[cfg(feature = "serialize")]
512pub use cache::{PersistentCache, PersistentlyCachedDataset, TensorPersistentCache};
513pub use distributed_loading::{
514    create_distributed_dataloader, CollectiveOpType, CommunicationManager,
515    DistributedLoadingConfig, DistributedLoadingStats, DistributedMessage,
516    EnhancedDistributedSampler, NodeInfo,
517};
518pub use distributed_sharding::{
519    DatasetShardingExt, ShardConfig, ShardStatistics, ShardStrategy, ShardableDataset,
520    ShardedDataset,
521};
522pub use distributed_streaming::{
523    CheckpointState, PartitionStrategy, StreamCoordinator, StreamingConfig, StreamingShardIterator,
524    StreamingShardLoader, StreamingStats, WorkerHealth, WorkerMetrics, WorkerStatus,
525};
526pub use federated::{
527    AggregationStrategy, ClientConfig, ClientId, ClientIndexedDataset, ClientStats,
528    DataDistribution, FederatedAggregator, FederatedClientDataset, FederatedDatasetExt,
529    FederatedFeatureStats, FederatedPartitioner, NoiseMechanism, PartitioningStrategy,
530    PrivacyConfig, PrivacyManager, PrivateStats, QualityMetrics,
531};
532#[cfg(feature = "parquet")]
533pub use formats::arrow::{
534    ArrowArrayExt, ArrowConfig, ArrowDataset, ArrowDatasetBuilder, ArrowFormatFactory,
535    ArrowFormatReader, ArrowTensorView,
536};
537#[cfg(feature = "audio")]
538pub use formats::audio::{
539    AudioConfig, AudioDataset, AudioDatasetBuilder, AudioDatasetInfo, AudioInfo,
540    AudioLabelStrategy, FeatureType as AudioFeatureType,
541};
542#[cfg(feature = "hdf5")]
543pub use formats::hdf5::{HDF5Config, HDF5Dataset, HDF5DatasetBuilder, HDF5DatasetInfo};
544#[cfg(feature = "parquet")]
545pub use formats::parquet::{
546    ParquetConfig, ParquetDataset, ParquetDatasetBuilder, ParquetDatasetInfo,
547};
548#[cfg(feature = "tfrecord")]
549pub use formats::tfrecord::{
550    Feature, FeatureInfo, FeatureType, TFRecord, TFRecordConfig, TFRecordDataset,
551    TFRecordDatasetBuilder, TFRecordDatasetInfo,
552};
553#[cfg(feature = "webdataset")]
554pub use formats::webdataset::{
555    StreamingWebDataset, WebDataset, WebDatasetBuilder, WebDatasetConfig, WebDatasetSample,
556};
557pub use formats::zarr::{
558    ZarrArrayInfo, ZarrCompressionType, ZarrConfig, ZarrDataset, ZarrDatasetBuilder, ZarrDatasetExt,
559};
560
561#[cfg(feature = "cloud")]
562pub use formats::zarr::CloudBackend;
563pub use gpu_transforms::{
564    GpuColorJitter, GpuContext, GpuGaussianBlur, GpuGaussianNoise, GpuRandomCrop,
565    GpuRandomHorizontalFlip, GpuResize, GpuRotation,
566};
567pub use memory_pool::{GlobalMemoryPool, MemoryPool, MemoryPoolExt, PoolStats, PooledMemory};
568pub use multimodal::{
569    FusionStrategy, Modality, MultimodalConfig, MultimodalDataset, MultimodalDatasetBuilder,
570    MultimodalSample, MultimodalTransform, MultimodalTransformedDataset,
571};
572pub use numa_scheduler::{
573    NumaAssignmentStats, NumaAssignmentStrategy, NumaConfig, NumaNode, NumaScheduler, NumaTopology,
574    NumaWorkerAssignment,
575};
576pub use online_learning::{
577    ADWINDetector, DriftDetectionMethod, DriftDetector, ErrorRateDetector, KSDetector,
578    OnlineLearningConfig, OnlineLearningDataset, OnlineStats, PageHinkleyDetector,
579};
580pub use predictive_prefetch::{
581    AccessPattern, AccessStats, PredictivePrefetchDataset, PredictivePrefetcher, PrefetchConfig,
582};
583#[cfg(feature = "download")]
584pub use real_datasets::{
585    AgNewsConfig, Cifar10Config, ImageNetConfig, ImdbConfig, MnistConfig, RealAgNewsBuilder,
586    RealAgNewsDataset, RealCifar10Builder, RealCifar10Dataset, RealImageNetBuilder,
587    RealImageNetDataset, RealImdbBuilder, RealImdbDataset, RealMnistBuilder, RealMnistDataset,
588};
589pub use reproducibility::{
590    DatasetConfig, DeterministicDataset, DeterministicOps, DeterministicOrdering, EnvironmentInfo,
591    ExperimentConfig, ExperimentTracker, OperationRecord, OrderingStrategy, ReproducibilityExt,
592    SamplingConfig, SeedInfo, SeedManager, TransformConfig,
593};
594pub use schema_inference::{
595    FieldStatistics, InferenceConfig, InferredDataType, InferredField, InferredSchema,
596    SchemaInferenceEngine,
597};
598pub use simd_transforms::{
599    BenchmarkResult as SimdBenchmarkResult, SimdBenchmark, SimdColorConvert, SimdConvolution,
600    SimdElementWise, SimdHistogram, SimdHistogramTransform, SimdMatrixOps, SimdNormalize,
601    SimdOperation, SimdStats,
602};
603pub use smart_cache::{
604    AccessPatternPredictor, CacheConfig, CacheLevel, EvictionPolicy, PredictiveSmartCache,
605    SmartCache, SmartCachedDataset,
606};
607pub use statistics::{
608    AdvancedStatistics, AdvancedStatisticsExt, CorrelationAnalyzer, DatasetStatisticsComputer,
609    DatasetStatisticsExt, DatasetStats, Histogram, MultivariateStatistics, PCAResult,
610    StatisticsConfig,
611};
612pub use stream_prefetch_optimizer::{
613    AccessEvent, AccessPatternAnalyzer, AccessType, PatternPrediction, PatternSignature,
614    PrefetchMetrics, PrefetchOptimizerConfig, StreamPrefetchOptimizer,
615};
616pub use streaming_optimized::{
617    AdaptiveBuffer, CompressionType, StreamingOptimizedConfig, StreamingOptimizedDataset,
618    StreamingOptimizedDatasetBuilder, StreamingOptimizedIterator,
619    StreamingStats as OptimizedStreamingStats,
620};
621pub use synthetic::{
622    ContrastiveLearningDataset, DatasetGenerator, Episode, FewShotDataset, GeometricShape,
623    GradientDirection, ImagePatternConfig, ImagePatternGenerator, ImagePatternType,
624    MetaLearningDataset, ModernMLConfig, NoiseDistribution, SelfSupervisedDataset,
625    StripeOrientation, SyntheticConfig, SyntheticDataset, SyntheticTextCorpus, TaskDataset,
626    TextCorpusConfig, TextSynthesisTask, TimeSeriesPattern,
627};
628pub use throughput_benchmark::{
629    MemoryStats as ThroughputMemoryStats, ThreadStats as ThroughputThreadStats,
630    ThroughputBenchmarkConfig, ThroughputBenchmarkHarness, ThroughputBenchmarkResult,
631};
632pub use validation::{
633    DataValidator, DatasetValidationExt, RangeConstraint, SchemaInfo, ValidationConfig,
634    ValidationResult,
635};
636pub use versioning::{
637    DatasetLineage, DatasetSizeInfo, DatasetVersionManager, LineageTree, TransformationRecord,
638    VersionId, VersionMetadata, VersionedDataset,
639};
640pub use visualization::{
641    ClassDistribution, DatasetVisualizationExt, DatasetVisualizer, DistributionInfo,
642    FeatureHistogram, FeatureStats, SampleInfo, SamplePreview,
643};
644pub use work_stealing::WorkStealingQueue;
645pub use zero_copy::{MemoryMappedDataset, TensorView, ZeroCopyDataset};
646
647#[cfg(feature = "mmap")]
648pub use zero_copy::{MemoryMappedFileDataset, MemoryMappedFileStats};
649
650// Core dataset traits and types extracted to dataset_core module.
651pub use dataset_core::{
652    BatchedDataset, ConcatDataset, Dataset, DatasetSplit, DatasetSplitter, DatasetUtilsExt,
653    FilteredDataset, MergeStrategy, MergedDataset, SubsetDataset, TensorDataset,
654};