Skip to main content

torsh_data/
lib.rs

1//! Data loading and preprocessing utilities for ToRSh
2//!
3//! This crate provides PyTorch-compatible data loading functionality,
4//! including datasets, data loaders, and common transformations.
5
6#![cfg_attr(not(feature = "std"), no_std)]
7
8#[cfg(not(feature = "std"))]
9extern crate alloc;
10
11pub mod builtin;
12pub mod collate;
13pub mod dataloader;
14pub mod dataset;
15pub mod error;
16pub mod sampler;
17pub mod transforms;
18pub mod utils;
19
20// NOTE: Async dataloader support is planned for future releases
21// #[cfg(feature = "async-support")]
22// pub use dataloader::async_dataloader::{
23//     async_dataloader, AsyncDataLoader, AsyncDataLoaderBuilder, AsyncDataLoaderStream,
24// };
25
26// #[cfg(feature = "async-support")]
27// pub mod async_utils {
28//     pub use crate::dataloader::async_dataloader::async_utils::*;
29// }
30
31#[cfg(feature = "gpu-acceleration")]
32pub mod gpu_acceleration;
33
34// #[cfg(feature = "arrow")]
35pub mod arrow_integration;
36
37#[cfg(feature = "hdf5-support")]
38pub mod hdf5_integration;
39
40#[cfg(feature = "parquet-support")]
41pub mod parquet_integration;
42
43pub mod tfrecord_integration;
44
45pub mod database_integration;
46
47#[cfg(feature = "image-support")]
48pub mod vision;
49
50#[cfg(feature = "image-support")]
51pub use vision::{
52    Compose, ImageFolder, ImageNet, ImageToTensor, RandomHorizontalFlip, RandomRotation,
53    RandomVerticalFlip, TensorToImage, TensorToVideo, VideoFolder, VideoFrames, VideoToTensor,
54    CIFAR10, MNIST,
55};
56
57#[cfg(feature = "dataframe")]
58pub mod tabular;
59
60#[cfg(feature = "audio-support")]
61pub mod audio;
62
63pub mod augmentation_pipeline;
64pub mod core_framework;
65pub mod online_transforms;
66pub mod tensor_transforms;
67pub mod text;
68pub mod text_processing;
69pub mod zero_copy;
70
71#[cfg(feature = "privacy")]
72pub mod privacy;
73
74#[cfg(feature = "federated")]
75pub mod federated;
76
77#[cfg(all(target_arch = "wasm32", feature = "wasm"))]
78pub mod wasm;
79
80pub use collate::{
81    collate_fn, AdaptiveBatchSampler, BucketBatchSampler, CachedCollate, Collate,
82    DynamicBatchCollate,
83};
84
85#[cfg(feature = "std")]
86pub use collate::{optimized_collate_fn, OptimizedCollate};
87
88#[cfg(feature = "sparse")]
89pub use collate::{MixedCollate, SparseCollate};
90pub use dataloader::{simple_random_dataloader, DataLoader, DataLoaderBuilder, DataLoaderTrait};
91pub use dataset::{
92    dataset_statistics, random_split, stratified_split, BufferedStreamingDataset, CachedDataset,
93    ChainDataset, ConcatDataset, DataPipeline, Dataset, DatasetToStreaming, FeatureStats,
94    InfiniteDataset, IterableDataset, KFold, PipelineStreamingDataset, RealTimeDataset,
95    StreamingDataset, Subset, TensorDataset,
96};
97
98#[cfg(feature = "std")]
99pub use dataset::{DatasetProfileStats, DatasetProfiler, ProfiledDataset, SharedMemoryDataset};
100
101#[cfg(all(feature = "std", feature = "mmap-support"))]
102pub use dataset::MemoryMappedDataset;
103pub use sampler::{
104    AcquisitionStrategy, ActiveLearningSampler, AdaptiveSampler, AdaptiveStrategy, BatchSampler,
105    BatchingSampler, CurriculumSampler, CurriculumStrategy, DistributedSampler, GroupedSampler,
106    ImportanceSampler, RandomSampler, Sampler, SequentialSampler, StratifiedSampler,
107    SubsetRandomSampler, WeightedRandomSampler,
108};
109
110#[cfg(feature = "privacy")]
111pub use privacy::{
112    dp_utils, CompositionType, DPMechanism, GaussianNoise, LaplaceNoise, NoiseGenerator,
113    PrivacyBudget, PrivacyBuilder, PrivateDataset, PrivateSampler,
114};
115
116#[cfg(feature = "federated")]
117pub use federated::{
118    federated_utils, AggregationStrategy, ClientCapabilities, ClientId, ClientInfo,
119    ClientSelectionStrategy, ComputePower, FederatedConfig, FederatedDataset,
120    FederatedDatasetBuilder, FederatedSampler, NetworkBandwidth,
121};
122
// WebAssembly-only re-exports (active for `wasm32` targets with the `wasm`
// feature). `wasm::StreamingDataset` is exposed under the alias
// `WasmStreamingDataset`: the crate root already re-exports
// `dataset::StreamingDataset`, and two `pub use` items with the same name in
// one module are rejected by the compiler (E0252) whenever this cfg is active.
// The unaliased item remains reachable as `wasm::StreamingDataset`.
#[cfg(all(target_arch = "wasm32", feature = "wasm"))]
pub use wasm::{
    optimization, wasm_utils, StreamingDataset as WasmStreamingDataset, WasmDataLoader,
    WasmDataset,
};
125
126pub use text::{
127    TextClassificationDataset, TextFileDataset, TextSequence, TokenIdsToTensor, Vocabulary,
128};
129
130pub use error::{
131    diagnostics, patterns, recovery, BatchInfo, CollationErrorKind, ConfigErrorKind, DataError,
132    DataLoaderErrorKind, DatasetErrorKind, ErrorContext, ErrorSeverity, IoErrorKind,
133    ResourceErrorKind, Result, SamplerErrorKind, TransformErrorKind, WithContext,
134};
135
136pub use transforms::{
137    // NOTE: Selective exports maintained for API stability
138    // augmentation_pipeline,
139    // // Specialized modules
140    // core_framework,
141    // lambda,
142    // normalize,
143    // online_transforms,
144    // tensor_transforms,
145    // text_processing,
146    // to_type,
147    // zero_copy,
148    Chain,
149    Compose as TransformCompose,
150    Conditional,
151    // Common types
152    Lambda,
153    Normalize,
154    ToType,
155    // Core framework
156    Transform,
157    TransformBuilder,
158    TransformExt,
159};
160
161pub use utils::{
162    batch, concurrent, create_size_tuple, errors, memory, performance, validate_dataset_path,
163    validate_file_extension, validate_not_empty, validate_positive, validate_probability,
164    validate_range, validate_same_length, validate_tensor_shape, Cacheable, Configurable,
165    ProgressTracker, Resettable,
166};
167
168pub use builtin::{
169    load_builtin_dataset, make_blobs, make_classification, make_regression, BuiltinDataset,
170    ClassificationConfig, ClusteringConfig, DatasetRegistry, DatasetResult, RegressionConfig,
171    ScalingMethod, SyntheticDataConfig,
172};
173
174// #[cfg(feature = "arrow")]
175pub use arrow_integration::{arrow_utils, ArrowDataset};
176
177#[cfg(feature = "hdf5-support")]
178pub use hdf5_integration::{hdf5_utils, HDF5DatasetBuilder, HDF5Metadata, HDF5TensorDataset};
179
180#[cfg(feature = "parquet-support")]
181pub use parquet_integration::{parquet_utils, ParquetDataset, ParquetDatasetBuilder, ParquetError};
182
183pub use tfrecord_integration::{
184    tfrecord_utils, Example, FeatureValue, TFRecordDataset, TFRecordDatasetBuilder, TFRecordError,
185    TFRecordReader,
186};
187
188pub use database_integration::{
189    database_utils, DatabaseBackend, DatabaseConfig, DatabaseConnection, DatabaseDataset,
190    DatabaseDatasetBuilder, DatabaseError, DatabaseRow, DatabaseValue,
191};
192
193// Version information
194pub const VERSION: &str = env!("CARGO_PKG_VERSION");
195pub const VERSION_MAJOR: u32 = 0;
196pub const VERSION_MINOR: u32 = 1;
197pub const VERSION_PATCH: u32 = 0;
198
199/// Prelude module for convenient imports
200pub mod prelude {
201    pub use crate::builtin::*;
202
203    // Collate - only re-export commonly used items, not internal modules
204    pub use crate::collate::{
205        collate_fn, Collate, CollateBuilder, CollateFn, CollateStrategy, DefaultCollate,
206        DynamicBatchCollate, PadCollate, TensorStacker,
207    };
208
209    pub use crate::dataloader::*;
210    pub use crate::dataset::*;
211    pub use crate::error::{DataError, ErrorContext, Result, WithContext};
212
213    // Sampler - only re-export commonly used items, not internal modules
214    pub use crate::sampler::{
215        AcquisitionStrategy, ActiveLearningSampler, AdaptiveSampler, BatchSampler, BatchingSampler,
216        CurriculumSampler, CurriculumStrategy, DistributedSampler, ImportanceSampler,
217        RandomSampler, Sampler, SamplerIterator, SequentialSampler, StratifiedSampler,
218        WeightedRandomSampler,
219    };
220
221    pub use crate::text::*;
222    pub use crate::utils::{
223        batch, concurrent, memory, performance, validate_not_empty, validate_positive,
224        validate_probability, validate_range, Cacheable, Configurable, ProgressTracker, Resettable,
225    };
226
227    #[cfg(feature = "std")]
228    pub use crate::dataset::SharedMemoryDataset;
229
230    #[cfg(all(feature = "std", feature = "mmap-support"))]
231    pub use crate::dataset::MemoryMappedDataset;
232
233    #[cfg(feature = "parquet-support")]
234    pub use crate::parquet_integration::{ParquetDataset, ParquetDatasetBuilder};
235
236    pub use crate::tfrecord_integration::{
237        TFRecordDataset, TFRecordDatasetBuilder, TFRecordReader,
238    };
239
240    pub use crate::database_integration::{
241        DatabaseConfig, DatabaseDataset, DatabaseDatasetBuilder,
242    };
243
244    #[cfg(feature = "privacy")]
245    pub use crate::privacy::{
246        DPMechanism, PrivacyBudget, PrivacyBuilder, PrivateDataset, PrivateSampler,
247    };
248
249    #[cfg(feature = "federated")]
250    pub use crate::federated::{
251        ClientSelectionStrategy, FederatedConfig, FederatedDataset, FederatedDatasetBuilder,
252    };
253
254    #[cfg(all(target_arch = "wasm32", feature = "wasm"))]
255    pub use crate::wasm::{wasm_utils, WasmDataLoader, WasmDataset};
256
257    // NOTE: Async dataloader utilities are planned for future releases
258    // #[cfg(feature = "async-support")]
259    // pub use crate::dataloader::async_dataloader::{
260    //     async_dataloader, AsyncDataLoader, AsyncDataLoaderBuilder,
261    // };
262}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the crate-root re-exports resolve and a `SequentialSampler`
    /// can be constructed through them.
    #[test]
    fn test_imports() {
        let sample_count = 10;
        // The sampler itself is not inspected; constructing it is the check.
        let _ = SequentialSampler::new(sample_count);
    }
}