Skip to main content

dataset_core/
lib.rs

1//! A generic, thread-safe dataset container with lazy loading and caching.
2//!
3//! `dataset-core` provides [`Dataset<T>`], a lightweight wrapper that pairs a storage
4//! directory with a lazily-initialized value of any type `T`. The actual downloading
5//! and parsing logic is supplied by the caller through a loader closure, making
6//! `Dataset<T>` suitable for any data source — local files, remote URLs, databases,
7//! or in-memory generation.
8//!
9//! On top of this core type, the crate offers **optional** feature-gated modules:
10//!
11//! - **`utils`** — helper functions for downloading files, extracting archives,
12//!   verifying SHA-256 hashes, and managing temporary directories.
13//! - **`datasets`** — ready-to-use loaders for classic ML datasets (Iris, Boston
14//!   Housing, Diabetes, Titanic, Wine Quality). These also serve as reference
15//!   implementations showing how to wrap `Dataset<T>` for a concrete use case.
16//!
17//! # Feature Flags
18//!
19//! | Feature    | What it enables                                                  |
20//! |------------|------------------------------------------------------------------|
21//! | `utils`    | `download_to`, `unzip`, `create_temp_dir`, `file_sha256_matches`, `acquire_dataset`, and the `error` module |
22//! | `datasets` | All built-in dataset loaders (implies `utils`)                   |
23//!
//! With no features enabled, only `Dataset<T>` is available, and it depends solely on the standard library (`std::sync`).
25//!
26//! # Quick Start — `Dataset<T>`
27//!
28//! ```rust
29//! use dataset_core::Dataset;
30//!
31//! fn my_loader(dir: &str) -> Result<Vec<String>, std::io::Error> {
32//!     // In a real use case you would read/download files from `dir`.
33//!     Ok(vec!["hello".to_string(), "world".to_string()])
34//! }
35//!
36//! let ds: Dataset<Vec<String>> = Dataset::new("./my_data");
37//!
38//! // First call runs the loader; subsequent calls return the cached reference.
39//! let data = ds.load(my_loader).unwrap();
40//! assert_eq!(data.len(), 2);
41//!
42//! let data_again = ds.load(my_loader).unwrap();
43//! assert!(std::ptr::eq(data, data_again)); // same reference, no reload
44//! ```
45//!
46//! # Built-in Datasets (feature `datasets`)
47//!
48//! | Dataset              | Samples | Features | Task Type      |
49//! |----------------------|---------|----------|----------------|
50//! | Iris                 | 150     | 4        | Classification |
51//! | Boston Housing       | 506     | 13       | Regression     |
52//! | Diabetes             | 768     | 8        | Classification |
53//! | Titanic              | 891     | 11       | Classification |
54//! | Wine Quality (Red)   | 1,599   | 11       | Regression     |
55//! | Wine Quality (White) | 4,898   | 11       | Regression     |
56//!
57//! ```rust,ignore
58//! use dataset_core::datasets::iris::Iris;
59//!
60//! let iris = Iris::new("./data");
61//! let (features, labels) = iris.data().unwrap();
62//! assert_eq!(features.shape(), &[150, 4]);
63//! ```
64//!
65//! # Utility Functions (feature `utils`)
66//!
67//! - `download_to` — download a remote file into a directory
68//! - `unzip` — extract a ZIP archive
69//! - `create_temp_dir` — create a self-cleaning temporary directory
70//! - `file_sha256_matches` — verify a file's SHA-256 hash
71//! - `acquire_dataset` — cache-aware dataset acquisition workflow
72//!   (temp dir → prepare → optional hash check → move to final location)
73
74#[cfg(feature = "utils")]
75pub use error::{DataFormatErrorKind, DatasetError};
76use std::sync::OnceLock;
77#[cfg(feature = "utils")]
78pub use utils::{acquire_dataset, create_temp_dir, download_to, file_sha256_matches, unzip};
79
/// A generic, thread-safe dataset container with lazy loading and in-memory caching.
///
/// `Dataset<T>` is a thin caching wrapper that holds a `storage_dir` (the directory
/// where dataset files are stored on disk) and a lazily-initialized value of type `T`.
/// The actual downloading and parsing logic is provided by the caller through a loader
/// closure passed to [`Dataset::load`].
///
/// This struct is designed to be the building block for both the built-in datasets
/// shipped with this crate and any custom datasets defined by external users.
///
/// # Type Parameter
///
/// - `T` - The type of the parsed dataset. Can be any type, such as
///   `(Array2<f64>, Array1<f64>)`, a custom struct, or any other data representation.
///   `T` must implement `Send + Sync` for `Dataset<T>` to be shared across threads.
///
/// # Thread Safety
///
/// `Dataset<T>` is `Send + Sync` when `T` is `Send + Sync`. Initialization is
/// serialized by an internal lock: when several threads call [`Dataset::load`]
/// concurrently, exactly one loader runs at a time and, once a loader succeeds,
/// no further loader is ever invoked. Threads that lose the race block until the
/// winner finishes and then receive the cached value.
///
/// # Example
///
/// ```rust
/// use dataset_core::Dataset;
///
/// // Define a simple loader that reads a value from the storage directory path.
/// // The loader can return any error type you choose.
/// fn my_loader(dir: &str) -> Result<Vec<String>, std::io::Error> {
///     // In a real use case, you would download/read files from `dir`.
///     // Here we just demonstrate the caching behavior.
///     Ok(vec!["hello".to_string(), "world".to_string()])
/// }
///
/// let dataset: Dataset<Vec<String>> = Dataset::new("./my_data");
///
/// // The first call to `load` triggers the loader
/// let data = dataset.load(my_loader).unwrap();
/// assert_eq!(data.len(), 2);
///
/// // Subsequent calls return the cached reference instantly
/// let data_again = dataset.load(my_loader).unwrap();
/// assert!(std::ptr::eq(data, data_again)); // same reference, no re-load
///
/// // Check whether data has been loaded
/// assert!(dataset.is_loaded());
/// ```
pub struct Dataset<T> {
    /// Path handed to the loader. `Dataset` itself never reads or creates it.
    storage_dir: String,
    /// The cached value; stays empty until a loader returns `Ok`.
    data: OnceLock<T>,
    /// Serializes initialization so two loaders can never run concurrently for
    /// the same dataset. It guards no data, so a poisoned lock is harmless.
    init_lock: std::sync::Mutex<()>,
}

impl<T> Dataset<T> {
    /// Create a new `Dataset` instance without loading any data.
    ///
    /// This is a lightweight operation that only stores the storage directory path.
    /// No I/O or network requests are performed until [`Dataset::load`] is called.
    ///
    /// # Parameters
    ///
    /// - `storage_dir` - Directory where dataset files will be stored. `Dataset`
    ///   only passes this path to the loader; creating the directory when it does
    ///   not exist is the loader's responsibility.
    ///
    /// # Returns
    ///
    /// A new `Dataset<T>` instance ready for lazy loading.
    pub fn new(storage_dir: &str) -> Self {
        Dataset {
            storage_dir: storage_dir.to_string(),
            data: OnceLock::new(),
            init_lock: std::sync::Mutex::new(()),
        }
    }

    /// Load the dataset, executing the loader on first call and caching the result.
    ///
    /// On the first call, the `loader` closure is invoked with the storage directory
    /// path. The returned value is cached internally. All subsequent calls — from any
    /// thread — return a reference to the cached value without running the loader again.
    ///
    /// Concurrent first calls are serialized: one thread runs its loader while the
    /// others wait, and the waiters then observe the cached value instead of
    /// running their own loaders.
    ///
    /// # Parameters
    ///
    /// - `loader` - A closure or function that takes the storage directory path (`&str`)
    ///   and returns `Result<T, E>`. This is where you perform downloading,
    ///   file I/O, and parsing. If the data is already cached, the loader is
    ///   dropped without being called.
    ///
    /// # Returns
    ///
    /// - `Ok(&T)` - A reference to the cached dataset.
    ///
    /// # Errors
    ///
    /// Returns any error produced by the `loader` closure. A failed load caches
    /// nothing, so a later call runs its own loader again. Once data is successfully
    /// loaded and cached, this method never returns an error.
    ///
    /// # Reentrancy
    ///
    /// The loader must not call `load` on the same `Dataset`: initialization holds
    /// an internal lock, so a nested call from the same thread deadlocks or panics.
    pub fn load<E>(&self, loader: impl FnOnce(&str) -> Result<T, E>) -> Result<&T, E> {
        // Fast path: once cached, no locking is required.
        if let Some(data) = self.data.get() {
            return Ok(data);
        }

        // Slow path: only one thread may initialize at a time. A poisoned lock
        // just means an earlier loader panicked; the cell is still empty and
        // consistent, so recover the guard and carry on.
        let _init_guard = self
            .init_lock
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);

        // Another thread may have finished loading while we waited for the lock.
        if let Some(data) = self.data.get() {
            return Ok(data);
        }

        let value = loader(&self.storage_dir)?;

        // We hold the init lock and the cell is empty, so this stores `value`
        // and hands back a reference to it without a fallible `set` + `expect`.
        Ok(self.data.get_or_init(|| value))
    }

    /// Check whether the dataset has been loaded into memory.
    ///
    /// # Returns
    ///
    /// `true` if [`Dataset::load`] has completed successfully at least once,
    /// `false` otherwise (including after a load that returned an error).
    pub fn is_loaded(&self) -> bool {
        self.data.get().is_some()
    }

    /// Get the storage directory path.
    ///
    /// # Returns
    ///
    /// The storage directory path as a string slice, exactly as passed to
    /// [`Dataset::new`].
    pub fn storage_dir(&self) -> &str {
        &self.storage_dir
    }
}
208
209impl<T> std::fmt::Debug for Dataset<T> {
210    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211        f.debug_struct("Dataset")
212            .field("storage_dir", &self.storage_dir)
213            .field("data_loaded", &self.is_loaded())
214            .finish()
215    }
216}
217
218/// Error handling module.
219///
220/// Provides structured error types for dataset loading operations including
221/// download failures, validation errors, I/O errors, and detailed data format
222/// errors with line numbers and contextual information for debugging.
223#[cfg(feature = "utils")]
224pub mod error;
225
226/// Utility functions for dataset authors.
227///
/// Provides helpers for downloading files, extracting archives, verifying
/// SHA-256 hashes, and managing the dataset acquisition workflow.
230#[cfg(feature = "utils")]
231pub mod utils;
232
233/// Built-in dataset implementations.
234///
235/// Contains ready-to-use loaders for common machine learning datasets.
236/// Each submodule also serves as an example of how to wrap [`Dataset<T>`]
237/// to implement a custom dataset loader.
238#[cfg(feature = "datasets")]
239pub mod datasets;