Skip to main content

hdf5_reader/
lib.rs

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20// Level 2 — Data Objects
21pub mod messages;
22pub mod object_header;
23
24// High-level API
25pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32// Filters
33pub mod filters;
34
35// Utilities
36pub mod cache;
37
38use std::collections::HashMap;
39use std::path::{Path, PathBuf};
40use std::sync::{Arc, OnceLock};
41
42use memmap2::Mmap;
43// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
44
45use cache::ChunkCache;
46use error::{Error, Result};
47use group::Group;
48use messages::HdfMessage;
49use object_header::ObjectHeader;
50use shared_message_table::SharedMessageTableRef;
51use storage::DynStorage;
52use superblock::Superblock;
53
54// Re-exports
55pub use attribute_api::Attribute;
56use dataset::DatasetTemplate;
57pub use dataset::{Dataset, SliceInfo, SliceInfoElem};
58pub use datatype_api::{
59    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
60    StringPadding, StringSize,
61};
62pub use error::ByteOrder;
63pub use filters::FilterRegistry;
64pub use messages::datatype::Datatype;
65pub use storage::{BytesStorage, FileStorage, MmapStorage, Storage, StorageBuffer};
66
67/// Configuration options for opening an HDF5 file.
68pub struct OpenOptions {
69    /// Maximum bytes for the chunk cache. Default: 64 MiB.
70    pub chunk_cache_bytes: usize,
71    /// Maximum number of chunk cache slots. Default: 521.
72    pub chunk_cache_slots: usize,
73    /// Custom filter registry. If `None`, the default built-in filters are used.
74    pub filter_registry: Option<FilterRegistry>,
75    /// Resolver for HDF5 external raw data files.
76    pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
77    /// Optional resolver for HDF5 external links.
78    pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
79}
80
81impl Default for OpenOptions {
82    fn default() -> Self {
83        OpenOptions {
84            chunk_cache_bytes: 64 * 1024 * 1024,
85            chunk_cache_slots: 521,
86            filter_registry: None,
87            external_file_resolver: None,
88            external_link_resolver: None,
89        }
90    }
91}
92
93/// Resolves file names from HDF5 External Data Files messages to storage.
94pub trait ExternalFileResolver: Send + Sync {
95    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
96}
97
98/// Resolves HDF5 external links to another opened file.
99pub trait ExternalLinkResolver: Send + Sync {
100    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
101}
102
/// Resolves external raw data files against a directory on the local filesystem.
#[derive(Debug, Clone)]
pub struct FilesystemExternalFileResolver {
    /// Directory that relative file names are resolved against.
    base_dir: PathBuf,
}

impl FilesystemExternalFileResolver {
    /// Create a resolver that looks up relative file names inside `base_dir`.
    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
        let base_dir = base_dir.into();
        Self { base_dir }
    }

    /// Compute the filesystem path for `filename`.
    ///
    /// Absolute names are returned unchanged; relative names are placed under
    /// the base directory.
    fn path_for(&self, filename: &str) -> PathBuf {
        // `join` already replaces the base when its argument is absolute, so a
        // single call covers both the absolute and the relative case.
        self.base_dir.join(filename)
    }
}
125
126impl ExternalFileResolver for FilesystemExternalFileResolver {
127    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
128        let path = self.path_for(filename);
129        if !path.exists() {
130            return Ok(None);
131        }
132        Ok(Some(Arc::new(FileStorage::open(path)?)))
133    }
134}
135
136/// Filesystem resolver for external links. Linked files are cached after the
137/// first successful open.
138pub struct FilesystemExternalLinkResolver {
139    base_dir: PathBuf,
140    cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
141}
142
143impl FilesystemExternalLinkResolver {
144    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
145        Self {
146            base_dir: base_dir.into(),
147            cache: parking_lot::Mutex::new(HashMap::new()),
148        }
149    }
150
151    fn path_for(&self, filename: &str) -> PathBuf {
152        let path = Path::new(filename);
153        if path.is_absolute() {
154            path.to_path_buf()
155        } else {
156            self.base_dir.join(path)
157        }
158    }
159}
160
161impl ExternalLinkResolver for FilesystemExternalLinkResolver {
162    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
163        let path = self.path_for(filename);
164        if !path.exists() {
165            return Ok(None);
166        }
167
168        if let Some(file) = self.cache.lock().get(&path).cloned() {
169            return Ok(Some(file));
170        }
171
172        let file = Hdf5File::open(&path)?;
173        self.cache.lock().insert(path, file.clone());
174        Ok(Some(file))
175    }
176}
177
178/// Cache for parsed object headers, keyed by file address.
179pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
180
181/// An opened HDF5 file.
182///
183/// This is the main entry point for reading HDF5 files. Storage is random-
184/// access and range-based, so metadata and data reads do not require an eager
185/// whole-file mapping.
186#[derive(Clone)]
187pub struct Hdf5File {
188    context: Arc<FileContext>,
189}
190
191pub(crate) struct FileContext {
192    pub(crate) storage: DynStorage,
193    pub(crate) superblock: Superblock,
194    pub(crate) chunk_cache: Arc<ChunkCache>,
195    pub(crate) header_cache: HeaderCache,
196    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
197    pub(crate) filter_registry: Arc<FilterRegistry>,
198    pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
199    pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
200    pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
201    sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
202    full_file_cache: OnceLock<StorageBuffer>,
203}
204
205impl FileContext {
206    pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
207        self.storage.read_range(offset, len)
208    }
209
210    pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
211        if let Some(buffer) = self.full_file_cache.get() {
212            return Ok(buffer.clone());
213        }
214
215        let len = usize::try_from(self.storage.len()).map_err(|_| {
216            Error::InvalidData("file size exceeds platform usize capacity".to_string())
217        })?;
218        let buffer = self.storage.read_range(0, len)?;
219        let _ = self.full_file_cache.set(buffer);
220        Ok(self
221            .full_file_cache
222            .get()
223            .expect("full-file buffer must exist after successful initialization")
224            .clone())
225    }
226
227    pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
228        {
229            let cache = self.header_cache.lock();
230            if let Some(hdr) = cache.get(&addr) {
231                return Ok(Arc::clone(hdr));
232            }
233        }
234
235        let mut hdr = ObjectHeader::parse_at_storage(
236            self.storage.as_ref(),
237            addr,
238            self.superblock.offset_size,
239            self.superblock.length_size,
240        )?;
241        hdr.resolve_shared_messages_storage_with_sohm(
242            self.storage.as_ref(),
243            self.superblock.offset_size,
244            self.superblock.length_size,
245            |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
246        )?;
247        let arc = Arc::new(hdr);
248        let mut cache = self.header_cache.lock();
249        cache.insert(addr, Arc::clone(&arc));
250        Ok(arc)
251    }
252
253    fn resolve_sohm_message(
254        &self,
255        heap_id: &[u8],
256        message_type: u16,
257    ) -> Result<Option<HdfMessage>> {
258        let Some(table) = self.sohm_table()? else {
259            return Ok(None);
260        };
261        table.resolve_heap_message(
262            heap_id,
263            message_type,
264            self.storage.as_ref(),
265            self.superblock.offset_size,
266            self.superblock.length_size,
267        )
268    }
269
270    fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
271        let cached = self
272            .sohm_table
273            .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
274        match cached {
275            Ok(table) => Ok(table.clone()),
276            Err(message) => Err(Error::InvalidData(format!(
277                "failed to load SOHM table: {message}"
278            ))),
279        }
280    }
281
282    fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
283        let Some(extension_address) = self.superblock.extension_address else {
284            return Ok(None);
285        };
286        let extension = ObjectHeader::parse_at_storage(
287            self.storage.as_ref(),
288            extension_address,
289            self.superblock.offset_size,
290            self.superblock.length_size,
291        )?;
292
293        let shared_table = extension.messages.iter().find_map(|message| match message {
294            HdfMessage::SharedTable(table) => Some(table),
295            _ => None,
296        });
297        let Some(shared_table) = shared_table else {
298            return Ok(None);
299        };
300
301        let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
302            self.storage.as_ref(),
303            shared_table.table_address,
304            shared_table.num_indices,
305            self.superblock.offset_size,
306        )?;
307        Ok(Some(Arc::new(table)))
308    }
309
310    pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
311        if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
312            return Ok(Some(storage));
313        }
314
315        let Some(resolver) = self.external_file_resolver.as_ref() else {
316            return Ok(None);
317        };
318        let Some(storage) = resolver.resolve_external_file(filename)? else {
319            return Ok(None);
320        };
321        self.external_file_cache
322            .lock()
323            .insert(filename.to_string(), storage.clone());
324        Ok(Some(storage))
325    }
326}
327
328impl Hdf5File {
329    fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
330        let superblock = Superblock::parse_from_storage(storage.as_ref())?;
331        let cache = Arc::new(ChunkCache::new(
332            options.chunk_cache_bytes,
333            options.chunk_cache_slots,
334        ));
335        let registry = options.filter_registry.unwrap_or_default();
336        let external_file_resolver = options.external_file_resolver;
337        let external_link_resolver = options.external_link_resolver;
338
339        Ok(Hdf5File {
340            context: Arc::new(FileContext {
341                storage,
342                superblock,
343                chunk_cache: cache,
344                header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
345                dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
346                filter_registry: Arc::new(registry),
347                external_file_resolver,
348                external_link_resolver,
349                external_file_cache: parking_lot::Mutex::new(HashMap::new()),
350                sohm_table: OnceLock::new(),
351                full_file_cache: OnceLock::new(),
352            }),
353        })
354    }
355
356    /// Open an HDF5 file with default options.
357    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
358        Self::open_with_options(path, OpenOptions::default())
359    }
360
361    /// Open an HDF5 file with custom options.
362    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
363        let path = path.as_ref();
364        let mut options = options;
365        if options.external_file_resolver.is_none() {
366            let base_dir = path
367                .parent()
368                .map(Path::to_path_buf)
369                .unwrap_or_else(|| PathBuf::from("."));
370            options.external_file_resolver =
371                Some(Arc::new(FilesystemExternalFileResolver::new(base_dir)));
372        }
373        Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
374    }
375
376    /// Open an HDF5 file from an in-memory byte slice.
377    ///
378    /// The data is copied into an owned buffer.
379    pub fn from_bytes(data: &[u8]) -> Result<Self> {
380        Self::from_bytes_with_options(data, OpenOptions::default())
381    }
382
383    /// Open an HDF5 file from an in-memory byte slice with custom options.
384    ///
385    /// The data is copied into an owned buffer.
386    pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
387        Self::from_vec_with_options(data.to_vec(), options)
388    }
389
390    /// Open an HDF5 file from an owned byte vector without copying.
391    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
392        Self::from_vec_with_options(data, OpenOptions::default())
393    }
394
395    /// Open an HDF5 file from an owned byte vector with custom options.
396    pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
397        Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
398    }
399
400    /// Open an HDF5 file from an existing memory map with custom options.
401    ///
402    /// This avoids remapping when the caller already owns a read-only mapping.
403    pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
404        Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
405    }
406
407    /// Open an HDF5 file from a custom random-access storage backend.
408    pub fn from_storage(storage: DynStorage) -> Result<Self> {
409        Self::from_storage_with_options(storage, OpenOptions::default())
410    }
411
412    /// Open an HDF5 file from a custom random-access storage backend.
413    pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
414        Self::from_storage_impl(storage, options)
415    }
416
417    /// Get the parsed superblock.
418    pub fn superblock(&self) -> &Superblock {
419        &self.context.superblock
420    }
421
422    /// Access the underlying random-access storage backend.
423    pub fn storage(&self) -> &dyn Storage {
424        self.context.storage.as_ref()
425    }
426
427    /// Look up or parse an object header at the given address.
428    ///
429    /// Uses the internal cache to avoid re-parsing the same header.
430    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
431        self.context.get_or_parse_header(addr)
432    }
433
434    /// Get the root group of the file.
435    pub fn root_group(&self) -> Result<Group> {
436        let addr = self.context.superblock.root_object_header_address()?;
437
438        Ok(Group::new(
439            self.context.clone(),
440            addr,
441            "/".to_string(),
442            addr, // root_address = self
443        ))
444    }
445
446    /// Convenience: get a dataset at a path like "/group1/dataset".
447    pub fn dataset(&self, path: &str) -> Result<Dataset> {
448        let parts: Vec<&str> = path
449            .trim_start_matches('/')
450            .split('/')
451            .filter(|s| !s.is_empty())
452            .collect();
453        let normalized_path = format!("/{}", parts.join("/"));
454
455        if parts.is_empty() {
456            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
457        }
458
459        if let Some(template) = self
460            .context
461            .dataset_path_cache
462            .lock()
463            .get(&normalized_path)
464            .cloned()
465        {
466            return Ok(Dataset::from_template(self.context.clone(), template));
467        }
468
469        let mut group = self.root_group()?;
470        for &part in &parts[..parts.len() - 1] {
471            group = group.group(part).map_err(|e| e.with_context(path))?;
472        }
473
474        let dataset = group
475            .dataset(parts[parts.len() - 1])
476            .map_err(|e| e.with_context(path))?;
477        if Arc::ptr_eq(&dataset.context, &self.context) {
478            self.context
479                .dataset_path_cache
480                .lock()
481                .insert(normalized_path, dataset.template());
482        }
483        Ok(dataset)
484    }
485
486    /// Convenience: get a group at a path like "/group1/subgroup".
487    pub fn group(&self, path: &str) -> Result<Group> {
488        let parts: Vec<&str> = path
489            .trim_start_matches('/')
490            .split('/')
491            .filter(|s| !s.is_empty())
492            .collect();
493
494        let mut group = self.root_group()?;
495        for &part in &parts {
496            group = group.group(part)?;
497        }
498
499        Ok(group)
500    }
501}
502
#[cfg(test)]
mod tests {
    use super::*;

    /// The documented cache defaults are what `Default` actually produces.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();

        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Bytes that are not an HDF5 file are rejected at open time.
    #[test]
    fn test_invalid_file() {
        let data = b"this is not an HDF5 file";
        assert!(Hdf5File::from_bytes(data).is_err());
    }

    /// A relative name is resolved against the resolver's base directory and
    /// the resulting storage serves byte ranges of that file.
    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("raw.bin"), b"abcdef").unwrap();

        let storage = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file("raw.bin")
            .unwrap()
            .expect("raw file should resolve");

        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }
}
535}