dataset_core/lib.rs
1//! A generic, thread-safe dataset container with lazy loading and caching.
2//!
3//! `dataset-core` provides [`Dataset<T>`], a lightweight wrapper that pairs a storage
4//! directory with a lazily-initialized value of any type `T`. The actual downloading
5//! and parsing logic is supplied by the caller through a loader closure, making
6//! `Dataset<T>` suitable for any data source — local files, remote URLs, databases,
7//! or in-memory generation.
8//!
9//! On top of this core type, the crate offers **optional** feature-gated modules:
10//!
11//! - **`utils`** — helper functions for downloading files, extracting archives,
12//! verifying SHA-256 hashes, and managing temporary directories.
13//! - **`datasets`** — ready-to-use loaders for classic ML datasets (Iris, Boston
14//! Housing, Diabetes, Titanic, Wine Quality). These also serve as reference
15//! implementations showing how to wrap `Dataset<T>` for a concrete use case.
16//!
17//! # Feature Flags
18//!
19//! | Feature | What it enables |
20//! |------------|------------------------------------------------------------------|
21//! | `utils` | `download_to`, `unzip`, `create_temp_dir`, `file_sha256_matches`, `acquire_dataset`, and the `error` module |
22//! | `datasets` | All built-in dataset loaders (implies `utils`) |
23//!
24//! With no features enabled, only `Dataset<T>` is available, and it depends on nothing beyond the standard library (`std::sync`).
25//!
26//! # Quick Start — `Dataset<T>`
27//!
28//! ```rust
29//! use dataset_core::Dataset;
30//!
31//! fn my_loader(dir: &str) -> Result<Vec<String>, std::io::Error> {
32//! // In a real use case you would read/download files from `dir`.
33//! Ok(vec!["hello".to_string(), "world".to_string()])
34//! }
35//!
36//! let ds: Dataset<Vec<String>> = Dataset::new("./my_data");
37//!
38//! // First call runs the loader; subsequent calls return the cached reference.
39//! let data = ds.load(my_loader).unwrap();
40//! assert_eq!(data.len(), 2);
41//!
42//! let data_again = ds.load(my_loader).unwrap();
43//! assert!(std::ptr::eq(data, data_again)); // same reference, no reload
44//! ```
45//!
46//! # Built-in Datasets (feature `datasets`)
47//!
48//! | Dataset | Samples | Features | Task Type |
49//! |----------------------|---------|----------|----------------|
50//! | Iris | 150 | 4 | Classification |
51//! | Boston Housing | 506 | 13 | Regression |
52//! | Diabetes | 768 | 8 | Classification |
53//! | Titanic | 891 | 11 | Classification |
54//! | Wine Quality (Red) | 1,599 | 11 | Regression |
55//! | Wine Quality (White) | 4,898 | 11 | Regression |
56//!
57//! ```rust,ignore
58//! use dataset_core::datasets::iris::Iris;
59//!
60//! let iris = Iris::new("./data");
61//! let (features, labels) = iris.data().unwrap();
62//! assert_eq!(features.shape(), &[150, 4]);
63//! ```
64//!
65//! # Utility Functions (feature `utils`)
66//!
67//! - `download_to` — download a remote file into a directory
68//! - `unzip` — extract a ZIP archive
69//! - `create_temp_dir` — create a self-cleaning temporary directory
70//! - `file_sha256_matches` — verify a file's SHA-256 hash
71//! - `acquire_dataset` — cache-aware dataset acquisition workflow
72//! (temp dir → prepare → optional hash check → move to final location)
73
74#[cfg(feature = "utils")]
75pub use error::{DataFormatErrorKind, DatasetError};
76use std::sync::OnceLock;
77#[cfg(feature = "utils")]
78pub use utils::{acquire_dataset, create_temp_dir, download_to, file_sha256_matches, unzip};
79
/// A generic, thread-safe dataset container with lazy loading and in-memory caching.
///
/// `Dataset<T>` is a thin caching wrapper that holds a `storage_dir` (the directory
/// where dataset files are stored on disk) and a lazily-initialized value of type `T`.
/// The actual downloading and parsing logic is provided by the caller through a loader
/// closure passed to [`Dataset::load`].
///
/// This struct is designed to be the building block for both the built-in datasets
/// shipped with this crate and any custom datasets defined by external users.
///
/// # Type Parameter
///
/// - `T` - The type of the parsed dataset. Can be any type, such as
///   `(Array2<f64>, Array1<f64>)`, a custom struct, or any other data representation.
///   `T` must implement `Send + Sync` for `Dataset<T>` to be shared across threads.
///
/// # Thread Safety
///
/// `Dataset<T>` is `Send + Sync` when `T` is `Send + Sync`. First-time initialization
/// is serialized by an internal lock, so loaders passed to concurrent
/// [`Dataset::load`] calls never run at the same time, and once one of them has
/// succeeded no further loader is run. Reads of an already-cached value do not take
/// the lock.
///
/// # Example
///
/// ```rust
/// use dataset_core::Dataset;
///
/// // Define a simple loader that reads a value from the storage directory path.
/// // The loader can return any error type you choose.
/// fn my_loader(dir: &str) -> Result<Vec<String>, std::io::Error> {
///     // In a real use case, you would download/read files from `dir`.
///     // Here we just demonstrate the caching behavior.
///     Ok(vec!["hello".to_string(), "world".to_string()])
/// }
///
/// let dataset: Dataset<Vec<String>> = Dataset::new("./my_data");
///
/// // The first call to `load` triggers the loader
/// let data = dataset.load(my_loader).unwrap();
/// assert_eq!(data.len(), 2);
///
/// // Subsequent calls return the cached reference instantly
/// let data_again = dataset.load(my_loader).unwrap();
/// assert!(std::ptr::eq(data, data_again)); // same reference, no re-load
///
/// // Check whether data has been loaded
/// assert!(dataset.is_loaded());
/// ```
pub struct Dataset<T> {
    /// Directory path handed to the loader closure.
    storage_dir: String,
    /// The cached value; written at most once, by the first successful load.
    data: OnceLock<T>,
    /// Serializes first-time initialization so that concurrent callers of
    /// [`Dataset::load`] cannot run their loaders simultaneously.
    init_lock: std::sync::Mutex<()>,
}

impl<T> Dataset<T> {
    /// Create a new `Dataset` instance without loading any data.
    ///
    /// This is a lightweight operation that only stores the storage directory path.
    /// No I/O or network requests are performed until [`Dataset::load`] is called.
    ///
    /// # Parameters
    ///
    /// - `storage_dir` - Directory where dataset files will be stored. It is passed
    ///   verbatim to the loader; creating it (if needed) is the loader's job.
    ///
    /// # Returns
    ///
    /// A new `Dataset<T>` instance ready for lazy loading.
    pub fn new(storage_dir: &str) -> Self {
        Dataset {
            storage_dir: storage_dir.to_string(),
            data: OnceLock::new(),
            init_lock: std::sync::Mutex::new(()),
        }
    }

    /// Load the dataset, executing the loader on first call and caching the result.
    ///
    /// On the first call, the `loader` closure is invoked with the storage directory
    /// path. The returned value is cached internally. All subsequent calls — from any
    /// thread — return a reference to the cached value without running the loader again.
    ///
    /// When several threads race on the first call, exactly one loader runs at a time;
    /// the others wait and then reuse the cached value. If a loader fails, nothing is
    /// cached and the next caller's loader gets its turn.
    ///
    /// # Parameters
    ///
    /// - `loader` - A closure or function that takes the storage directory path (`&str`)
    ///   and returns `Result<T, E>`. This is where you perform downloading,
    ///   file I/O, and parsing. If the data is already cached, the loader is ignored.
    ///
    /// # Returns
    ///
    /// - `Ok(&T)` - A reference to the cached dataset.
    ///
    /// # Errors
    ///
    /// Returns any error produced by the `loader` closure. Once data is successfully
    /// loaded and cached, this method never returns an error.
    ///
    /// # Reentrancy
    ///
    /// The loader must not call `load` on the same `Dataset`: the initialization lock
    /// is already held by the current thread, so the nested call would deadlock or
    /// panic.
    pub fn load<E>(&self, loader: impl FnOnce(&str) -> Result<T, E>) -> Result<&T, E> {
        // Fast path: the value is already cached, no locking required.
        if let Some(data) = self.data.get() {
            return Ok(data);
        }

        // Slow path: take the initialization lock. A poisoned lock only means that
        // an earlier loader panicked; the guarded state is `()`, so it is always
        // safe to recover the guard and let this caller try again.
        let _guard = self
            .init_lock
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);

        // Another thread may have finished loading while we waited for the lock.
        if let Some(data) = self.data.get() {
            return Ok(data);
        }

        let value = loader(&self.storage_dir)?;

        // We hold the lock and the cell was empty, so this stores `value`.
        Ok(self.data.get_or_init(|| value))
    }

    /// Check whether the dataset has been loaded into memory.
    ///
    /// # Returns
    ///
    /// `true` if [`Dataset::load`] has completed successfully at least once,
    /// `false` otherwise.
    pub fn is_loaded(&self) -> bool {
        self.data.get().is_some()
    }

    /// Get the storage directory path.
    ///
    /// # Returns
    ///
    /// The storage directory path as a string slice.
    pub fn storage_dir(&self) -> &str {
        &self.storage_dir
    }
}
208
209impl<T> std::fmt::Debug for Dataset<T> {
210 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211 f.debug_struct("Dataset")
212 .field("storage_dir", &self.storage_dir)
213 .field("data_loaded", &self.is_loaded())
214 .finish()
215 }
216}
217
218/// Error handling module.
219///
220/// Provides structured error types for dataset loading operations including
221/// download failures, validation errors, I/O errors, and detailed data format
222/// errors with line numbers and contextual information for debugging.
223#[cfg(feature = "utils")]
224pub mod error;
225
226/// Utility functions for dataset authors.
227///
228/// Provides helpers for downloading files, extracting archives, verifying
229/// SHA-256 hashes, and managing the dataset acquisition workflow.
230#[cfg(feature = "utils")]
231pub mod utils;
232
233/// Built-in dataset implementations.
234///
235/// Contains ready-to-use loaders for common machine learning datasets.
236/// Each submodule also serves as an example of how to wrap [`Dataset<T>`]
237/// to implement a custom dataset loader.
238#[cfg(feature = "datasets")]
239pub mod datasets;