Skip to main content

hdf5_reader/
lib.rs

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod symbol_table;
18
19// Level 2 — Data Objects
20pub mod messages;
21pub mod object_header;
22
23// High-level API
24pub mod attribute_api;
25pub mod dataset;
26pub mod datatype_api;
27pub mod group;
28pub mod reference;
29pub mod storage;
30
31// Filters
32pub mod filters;
33
34// Utilities
35pub mod cache;
36
37use std::collections::HashMap;
38use std::path::Path;
39use std::sync::{Arc, OnceLock};
40
41use memmap2::Mmap;
42// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
43
44use cache::ChunkCache;
45use error::{Error, Result};
46use group::Group;
47use object_header::ObjectHeader;
48use storage::DynStorage;
49use superblock::Superblock;
50
51// Re-exports
52pub use attribute_api::Attribute;
53use dataset::DatasetTemplate;
54pub use dataset::{Dataset, SliceInfo, SliceInfoElem};
55pub use datatype_api::{
56    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
57    StringPadding, StringSize,
58};
59pub use error::ByteOrder;
60pub use filters::FilterRegistry;
61pub use messages::datatype::Datatype;
62pub use storage::{BytesStorage, FileStorage, MmapStorage, Storage, StorageBuffer};
63
64/// Configuration options for opening an HDF5 file.
65pub struct OpenOptions {
66    /// Maximum bytes for the chunk cache. Default: 64 MiB.
67    pub chunk_cache_bytes: usize,
68    /// Maximum number of chunk cache slots. Default: 521.
69    pub chunk_cache_slots: usize,
70    /// Custom filter registry. If `None`, the default built-in filters are used.
71    pub filter_registry: Option<FilterRegistry>,
72}
73
74impl Default for OpenOptions {
75    fn default() -> Self {
76        OpenOptions {
77            chunk_cache_bytes: 64 * 1024 * 1024,
78            chunk_cache_slots: 521,
79            filter_registry: None,
80        }
81    }
82}
83
84/// Cache for parsed object headers, keyed by file address.
85pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
86
87/// An opened HDF5 file.
88///
89/// This is the main entry point for reading HDF5 files. Storage is random-
90/// access and range-based, so metadata and data reads do not require an eager
91/// whole-file mapping.
92pub struct Hdf5File {
93    context: Arc<FileContext>,
94}
95
96pub(crate) struct FileContext {
97    pub(crate) storage: DynStorage,
98    pub(crate) superblock: Superblock,
99    pub(crate) chunk_cache: Arc<ChunkCache>,
100    pub(crate) header_cache: HeaderCache,
101    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
102    pub(crate) filter_registry: Arc<FilterRegistry>,
103    full_file_cache: OnceLock<StorageBuffer>,
104}
105
106impl FileContext {
107    pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
108        self.storage.read_range(offset, len)
109    }
110
111    pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
112        if let Some(buffer) = self.full_file_cache.get() {
113            return Ok(buffer.clone());
114        }
115
116        let len = usize::try_from(self.storage.len()).map_err(|_| {
117            Error::InvalidData("file size exceeds platform usize capacity".to_string())
118        })?;
119        let buffer = self.storage.read_range(0, len)?;
120        let _ = self.full_file_cache.set(buffer);
121        Ok(self
122            .full_file_cache
123            .get()
124            .expect("full-file buffer must exist after successful initialization")
125            .clone())
126    }
127
128    pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
129        {
130            let cache = self.header_cache.lock();
131            if let Some(hdr) = cache.get(&addr) {
132                return Ok(Arc::clone(hdr));
133            }
134        }
135
136        let mut hdr = ObjectHeader::parse_at_storage(
137            self.storage.as_ref(),
138            addr,
139            self.superblock.offset_size,
140            self.superblock.length_size,
141        )?;
142        hdr.resolve_shared_messages_storage(
143            self.storage.as_ref(),
144            self.superblock.offset_size,
145            self.superblock.length_size,
146        )?;
147        let arc = Arc::new(hdr);
148        let mut cache = self.header_cache.lock();
149        cache.insert(addr, Arc::clone(&arc));
150        Ok(arc)
151    }
152}
153
154impl Hdf5File {
155    fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
156        let superblock = Superblock::parse_from_storage(storage.as_ref())?;
157        let cache = Arc::new(ChunkCache::new(
158            options.chunk_cache_bytes,
159            options.chunk_cache_slots,
160        ));
161        let registry = options.filter_registry.unwrap_or_default();
162
163        Ok(Hdf5File {
164            context: Arc::new(FileContext {
165                storage,
166                superblock,
167                chunk_cache: cache,
168                header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
169                dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
170                filter_registry: Arc::new(registry),
171                full_file_cache: OnceLock::new(),
172            }),
173        })
174    }
175
176    /// Open an HDF5 file with default options.
177    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
178        Self::open_with_options(path, OpenOptions::default())
179    }
180
181    /// Open an HDF5 file with custom options.
182    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
183        Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
184    }
185
186    /// Open an HDF5 file from an in-memory byte slice.
187    ///
188    /// The data is copied into an owned buffer.
189    pub fn from_bytes(data: &[u8]) -> Result<Self> {
190        Self::from_bytes_with_options(data, OpenOptions::default())
191    }
192
193    /// Open an HDF5 file from an in-memory byte slice with custom options.
194    ///
195    /// The data is copied into an owned buffer.
196    pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
197        Self::from_vec_with_options(data.to_vec(), options)
198    }
199
200    /// Open an HDF5 file from an owned byte vector without copying.
201    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
202        Self::from_vec_with_options(data, OpenOptions::default())
203    }
204
205    /// Open an HDF5 file from an owned byte vector with custom options.
206    pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
207        Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
208    }
209
210    /// Open an HDF5 file from an existing memory map with custom options.
211    ///
212    /// This avoids remapping when the caller already owns a read-only mapping.
213    pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
214        Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
215    }
216
217    /// Open an HDF5 file from a custom random-access storage backend.
218    pub fn from_storage(storage: DynStorage) -> Result<Self> {
219        Self::from_storage_with_options(storage, OpenOptions::default())
220    }
221
222    /// Open an HDF5 file from a custom random-access storage backend.
223    pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
224        Self::from_storage_impl(storage, options)
225    }
226
227    /// Get the parsed superblock.
228    pub fn superblock(&self) -> &Superblock {
229        &self.context.superblock
230    }
231
232    /// Access the underlying random-access storage backend.
233    pub fn storage(&self) -> &dyn Storage {
234        self.context.storage.as_ref()
235    }
236
237    /// Look up or parse an object header at the given address.
238    ///
239    /// Uses the internal cache to avoid re-parsing the same header.
240    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
241        self.context.get_or_parse_header(addr)
242    }
243
244    /// Get the root group of the file.
245    pub fn root_group(&self) -> Result<Group> {
246        let addr = self.context.superblock.root_object_header_address()?;
247
248        Ok(Group::new(
249            self.context.clone(),
250            addr,
251            "/".to_string(),
252            addr, // root_address = self
253        ))
254    }
255
256    /// Convenience: get a dataset at a path like "/group1/dataset".
257    pub fn dataset(&self, path: &str) -> Result<Dataset> {
258        let parts: Vec<&str> = path
259            .trim_start_matches('/')
260            .split('/')
261            .filter(|s| !s.is_empty())
262            .collect();
263        let normalized_path = format!("/{}", parts.join("/"));
264
265        if parts.is_empty() {
266            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
267        }
268
269        if let Some(template) = self
270            .context
271            .dataset_path_cache
272            .lock()
273            .get(&normalized_path)
274            .cloned()
275        {
276            return Ok(Dataset::from_template(self.context.clone(), template));
277        }
278
279        let mut group = self.root_group()?;
280        for &part in &parts[..parts.len() - 1] {
281            group = group.group(part).map_err(|e| e.with_context(path))?;
282        }
283
284        let dataset = group
285            .dataset(parts[parts.len() - 1])
286            .map_err(|e| e.with_context(path))?;
287        self.context
288            .dataset_path_cache
289            .lock()
290            .insert(normalized_path, dataset.template());
291        Ok(dataset)
292    }
293
294    /// Convenience: get a group at a path like "/group1/subgroup".
295    pub fn group(&self, path: &str) -> Result<Group> {
296        let parts: Vec<&str> = path
297            .trim_start_matches('/')
298            .split('/')
299            .filter(|s| !s.is_empty())
300            .collect();
301
302        let mut group = self.root_group()?;
303        for &part in &parts {
304            group = group.group(part)?;
305        }
306
307        Ok(group)
308    }
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// The default options must expose the documented cache sizing.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();

        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Bytes that do not form an HDF5 file must be rejected at open time.
    #[test]
    fn test_invalid_file() {
        let opened = Hdf5File::from_bytes(b"this is not an HDF5 file");
        assert!(opened.is_err());
    }
}