Skip to main content

dataset_core/
lib.rs

1//! A generic, thread-safe dataset container with lazy loading and caching.
2//!
3//! `dataset-core` provides [`Dataset<T>`], a lightweight wrapper that pairs a storage
4//! directory with a lazily-initialized value of any type `T`. The actual downloading
5//! and parsing logic is supplied by the caller through a loader closure, making
6//! `Dataset<T>` suitable for any data source — local files, remote URLs, databases,
7//! or in-memory generation.
8//!
9//! On top of this core type, the crate offers an **optional** feature-gated module:
10//!
11//! - **`utils`** — helper functions for downloading files, extracting archives,
12//!   verifying SHA-256 hashes, and managing temporary directories.
13//!
14//! Ready-to-use loaders for classic ML datasets (Iris, Boston Housing, Diabetes,
15//! Titanic, Wine Quality) live in the companion crate
16//! [`dataset-ml`](https://crates.io/crates/dataset-ml), which depends on
17//! `dataset-core` with the `utils` feature enabled and serves as the reference
18//! implementation for wrapping `Dataset<T>`.
19//!
20//! # Feature Flags
21//!
22//! | Feature | What it enables                                                  |
23//! |---------|------------------------------------------------------------------|
24//! | `utils` | `download_to`, `unzip`, `create_temp_dir`, `file_sha256_matches`, `acquire_dataset`, and the `error` module |
25//!
//! With no features enabled, only `Dataset<T>` is available — depending only on
//! the standard library (`std::sync`, notably `OnceLock`).
28//!
29//! # Quick Start — `Dataset<T>`
30//!
31//! ```rust
32//! use dataset_core::Dataset;
33//!
34//! fn my_loader(dir: &str) -> Result<Vec<String>, std::io::Error> {
35//!     // In a real use case you would read/download files from `dir`.
36//!     Ok(vec!["hello".to_string(), "world".to_string()])
37//! }
38//!
39//! let ds: Dataset<Vec<String>> = Dataset::new("./my_data");
40//!
41//! // First call runs the loader; subsequent calls return the cached reference.
42//! let data = ds.load(my_loader).unwrap();
43//! assert_eq!(data.len(), 2);
44//!
45//! let data_again = ds.load(my_loader).unwrap();
46//! assert!(std::ptr::eq(data, data_again)); // same reference, no reload
47//! ```
48//!
49//! # Utility Functions (feature `utils`)
50//!
51//! - `download_to` — download a remote file into a directory
52//! - `unzip` — extract a ZIP archive
53//! - `create_temp_dir` — create a self-cleaning temporary directory
54//! - `file_sha256_matches` — verify a file's SHA-256 hash
55//! - `acquire_dataset` — cache-aware dataset acquisition workflow
56//!   (temp dir → prepare → optional hash check → move to final location)
57
58#[cfg(feature = "utils")]
59pub use error::{DataFormatErrorKind, DatasetError};
60use std::sync::OnceLock;
61#[cfg(feature = "utils")]
62pub use utils::{acquire_dataset, create_temp_dir, download_to, file_sha256_matches, unzip};
63
/// A generic, thread-safe dataset container with lazy loading and in-memory caching.
///
/// `Dataset<T>` is a thin caching wrapper that holds a `storage_dir` (the directory
/// where dataset files are stored on disk) and a lazily-initialized value of type `T`.
/// The actual downloading and parsing logic is provided by the caller through a loader
/// closure passed to [`Dataset::load`].
///
/// This struct is designed to be the building block for ready-made dataset wrappers
/// (such as those in the companion `dataset-ml` crate) and for any custom datasets
/// defined by external users.
///
/// # Type Parameter
///
/// - `T` - The type of the parsed dataset. Can be any type, such as
///   `(Array2<f64>, Array1<f64>)`, a custom struct, or any other data representation.
///   `T` must implement `Send + Sync` for `Dataset<T>` to be shared across threads.
///
/// # Thread Safety
///
/// `Dataset<T>` is `Send + Sync` when `T` is `Send + Sync`. First-time loading is
/// serialized by an internal lock: at most one loader runs at any moment, and once a
/// loader has succeeded no loader is ever invoked again — even when multiple threads
/// call [`Dataset::load`] concurrently. A loader that returns an error caches nothing,
/// so a later call gets to try again. Once the value is cached, `load` returns it
/// without taking the lock.
///
/// # Example
///
/// ```rust
/// use dataset_core::Dataset;
///
/// // Define a simple loader that reads a value from the storage directory path.
/// // The loader can return any error type you choose.
/// fn my_loader(dir: &str) -> Result<Vec<String>, std::io::Error> {
///     // In a real use case, you would download/read files from `dir`.
///     // Here we just demonstrate the caching behavior.
///     Ok(vec!["hello".to_string(), "world".to_string()])
/// }
///
/// let dataset: Dataset<Vec<String>> = Dataset::new("./my_data");
///
/// // The first call to `load` triggers the loader
/// let data = dataset.load(my_loader).unwrap();
/// assert_eq!(data.len(), 2);
///
/// // Subsequent calls return the cached reference instantly
/// let data_again = dataset.load(my_loader).unwrap();
/// assert!(std::ptr::eq(data, data_again)); // same reference, no re-load
///
/// // Check whether data has been loaded
/// assert!(dataset.is_loaded());
/// ```
pub struct Dataset<T> {
    /// Path handed to the loader; `Dataset` itself never reads or creates it.
    storage_dir: String,
    /// The cached value; written at most once, by the first successful loader.
    data: OnceLock<T>,
    /// Serializes first-time initialization so concurrent `load` calls cannot run
    /// their loaders in parallel. It protects no data of its own.
    init_lock: std::sync::Mutex<()>,
}

impl<T> Dataset<T> {
    /// Create a new `Dataset` instance without loading any data.
    ///
    /// This is a lightweight operation that only stores the storage directory path.
    /// No I/O or network requests are performed until [`Dataset::load`] is called.
    ///
    /// # Parameters
    ///
    /// - `storage_dir` - Directory where dataset files will be stored. `Dataset`
    ///   only records this path and passes it to the loader; creating the directory
    ///   (if it does not exist) is the loader's responsibility.
    ///
    /// # Returns
    ///
    /// A new `Dataset<T>` instance ready for lazy loading.
    pub fn new(storage_dir: &str) -> Self {
        Self {
            storage_dir: storage_dir.to_string(),
            data: OnceLock::new(),
            init_lock: std::sync::Mutex::new(()),
        }
    }

    /// Load the dataset, executing the loader on first call and caching the result.
    ///
    /// On the first call, the `loader` closure is invoked with the storage directory
    /// path. The returned value is cached internally. All subsequent calls — from any
    /// thread — return a reference to the cached value without running the loader again.
    ///
    /// Concurrent first-time calls are serialized: one caller runs its loader while
    /// the others wait, and the waiters then receive the value it cached. If that
    /// loader fails, nothing is cached and the next waiter runs its own loader.
    ///
    /// # Parameters
    ///
    /// - `loader` - A closure or function that takes the storage directory path (`&str`)
    ///   and returns `Result<T, E>`. This is where you perform downloading,
    ///   file I/O, and parsing. If the data is already cached, the loader is dropped
    ///   without being called.
    ///
    /// # Returns
    ///
    /// - `Ok(&T)` - A reference to the cached dataset.
    ///
    /// # Errors
    ///
    /// Returns any error produced by the `loader` closure. A failed load caches
    /// nothing, so a later call may retry. Once data is successfully loaded and
    /// cached, this method never returns an error.
    ///
    /// # Re-entrancy
    ///
    /// The loader must not call `load` on the same `Dataset`: the nested call would
    /// try to take the initialization lock the outer call already holds, and it will
    /// block forever or panic.
    pub fn load<E>(&self, loader: impl FnOnce(&str) -> Result<T, E>) -> Result<&T, E> {
        // Fast path: the value is already cached, no locking required.
        if let Some(data) = self.data.get() {
            return Ok(data);
        }

        // Slow path: only one thread at a time may attempt initialization.
        // The mutex guards no data, so a poisoned lock (a previous loader panicked)
        // carries no broken invariant and is safe to recover.
        let _init_guard = self
            .init_lock
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);

        // Another thread may have finished loading while we waited for the lock.
        if let Some(data) = self.data.get() {
            return Ok(data);
        }

        let value = loader(&self.storage_dir)?;

        // We hold the lock and the cell is still empty, so this stores `value`
        // and hands back a reference to it.
        Ok(self.data.get_or_init(|| value))
    }

    /// Check whether the dataset has been loaded into memory.
    ///
    /// # Returns
    ///
    /// `true` if [`Dataset::load`] has been called successfully at least once,
    /// `false` otherwise.
    pub fn is_loaded(&self) -> bool {
        self.data.get().is_some()
    }

    /// Get the storage directory path.
    ///
    /// # Returns
    ///
    /// The storage directory path as a string slice, exactly as passed to
    /// [`Dataset::new`].
    pub fn storage_dir(&self) -> &str {
        &self.storage_dir
    }
}
192
193impl<T> std::fmt::Debug for Dataset<T> {
194    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
195        f.debug_struct("Dataset")
196            .field("storage_dir", &self.storage_dir)
197            .field("data_loaded", &self.is_loaded())
198            .finish()
199    }
200}
201
202/// Error handling module.
203///
204/// Provides structured error types for dataset loading operations including
205/// download failures, validation errors, I/O errors, and detailed data format
206/// errors with line numbers and contextual information for debugging.
207#[cfg(feature = "utils")]
208pub mod error;
209
210/// Utility functions for dataset authors.
211///
/// Provides helpers for downloading files, extracting archives, verifying
/// SHA-256 hashes, and managing the dataset acquisition workflow.
214#[cfg(feature = "utils")]
215pub mod utils;