Skip to main content

hdf5_reader/
lib.rs

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20// Level 2 — Data Objects
21pub mod messages;
22pub mod object_header;
23
24// High-level API
25pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32// Filters
33pub mod filters;
34
35// Utilities
36pub mod cache;
37
38use std::collections::HashMap;
39use std::path::{Path, PathBuf};
40use std::sync::{Arc, OnceLock};
41
42use memmap2::Mmap;
43// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
44
45use cache::ChunkCache;
46use error::{Error, Result};
47use group::Group;
48use messages::HdfMessage;
49use object_header::ObjectHeader;
50use shared_message_table::SharedMessageTableRef;
51use storage::DynStorage;
52use superblock::Superblock;
53
54// Re-exports
55pub use attribute_api::Attribute;
56pub use cache::ChunkCacheStats;
57use dataset::DatasetTemplate;
58pub use dataset::{Dataset, DatasetChunk, DatasetChunkIterator, SliceInfo, SliceInfoElem};
59pub use datatype_api::{
60    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
61    StringPadding, StringSize, VarLenKind,
62};
63pub use error::ByteOrder;
64pub use filters::FilterRegistry;
65pub use messages::datatype::Datatype;
66pub use storage::{
67    BlockCacheStats, BlockCacheStorage, BytesStorage, FileStorage, MmapStorage,
68    RangeRequestStorage, Storage, StorageBuffer,
69};
70
71/// Configuration options for opening an HDF5 file.
72pub struct OpenOptions {
73    /// Maximum bytes for the chunk cache. Default: 64 MiB.
74    pub chunk_cache_bytes: usize,
75    /// Maximum number of chunk cache slots. Default: 521.
76    pub chunk_cache_slots: usize,
77    /// Custom filter registry. If `None`, the default built-in filters are used.
78    pub filter_registry: Option<FilterRegistry>,
79    /// Resolver for HDF5 external raw data files.
80    pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
81    /// Optional resolver for HDF5 external links.
82    pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
83}
84
85impl Default for OpenOptions {
86    fn default() -> Self {
87        OpenOptions {
88            chunk_cache_bytes: 64 * 1024 * 1024,
89            chunk_cache_slots: 521,
90            filter_registry: None,
91            external_file_resolver: None,
92            external_link_resolver: None,
93        }
94    }
95}
96
97/// Resolves file names from HDF5 External Data Files messages to storage.
98pub trait ExternalFileResolver: Send + Sync {
99    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
100}
101
102/// Resolves HDF5 external links to another opened file.
103pub trait ExternalLinkResolver: Send + Sync {
104    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
105}
106
/// Resolves external raw data files on the local filesystem.
///
/// Relative file names are interpreted relative to `base_dir`; absolute file
/// names are used unchanged.
#[derive(Debug, Clone)]
pub struct FilesystemExternalFileResolver {
    base_dir: PathBuf,
}

impl FilesystemExternalFileResolver {
    /// Creates a resolver that anchors relative file names at `base_dir`.
    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
        let base_dir = base_dir.into();
        Self { base_dir }
    }

    /// Turns a file name from the HDF5 metadata into a filesystem path.
    fn path_for(&self, filename: &str) -> PathBuf {
        let requested = Path::new(filename);
        if !requested.is_absolute() {
            return self.base_dir.join(requested);
        }
        requested.to_path_buf()
    }
}
129
130impl ExternalFileResolver for FilesystemExternalFileResolver {
131    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
132        let path = self.path_for(filename);
133        if !path.exists() {
134            return Ok(None);
135        }
136        Ok(Some(Arc::new(FileStorage::open(path)?)))
137    }
138}
139
140/// Filesystem resolver for external links. Linked files are cached after the
141/// first successful open.
142pub struct FilesystemExternalLinkResolver {
143    base_dir: PathBuf,
144    cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
145}
146
147impl FilesystemExternalLinkResolver {
148    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
149        Self {
150            base_dir: base_dir.into(),
151            cache: parking_lot::Mutex::new(HashMap::new()),
152        }
153    }
154
155    fn path_for(&self, filename: &str) -> PathBuf {
156        let path = Path::new(filename);
157        if path.is_absolute() {
158            path.to_path_buf()
159        } else {
160            self.base_dir.join(path)
161        }
162    }
163}
164
165impl ExternalLinkResolver for FilesystemExternalLinkResolver {
166    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
167        let path = self.path_for(filename);
168        if !path.exists() {
169            return Ok(None);
170        }
171
172        if let Some(file) = self.cache.lock().get(&path).cloned() {
173            return Ok(Some(file));
174        }
175
176        let file = Hdf5File::open(&path)?;
177        self.cache.lock().insert(path, file.clone());
178        Ok(Some(file))
179    }
180}
181
182/// Cache for parsed object headers, keyed by file address.
183pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
184
185/// An opened HDF5 file.
186///
187/// This is the main entry point for reading HDF5 files. Storage is random-
188/// access and range-based, so metadata and data reads do not require an eager
189/// whole-file mapping.
190#[derive(Clone)]
191pub struct Hdf5File {
192    context: Arc<FileContext>,
193}
194
195pub(crate) struct FileContext {
196    pub(crate) storage: DynStorage,
197    pub(crate) superblock: Superblock,
198    pub(crate) chunk_cache: Arc<ChunkCache>,
199    pub(crate) header_cache: HeaderCache,
200    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
201    pub(crate) filter_registry: Arc<FilterRegistry>,
202    pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
203    pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
204    pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
205    sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
206    full_file_cache: OnceLock<StorageBuffer>,
207}
208
209impl FileContext {
210    pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
211        self.storage.read_range(offset, len)
212    }
213
214    pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
215        if let Some(buffer) = self.full_file_cache.get() {
216            return Ok(buffer.clone());
217        }
218
219        let len = usize::try_from(self.storage.len()).map_err(|_| {
220            Error::InvalidData("file size exceeds platform usize capacity".to_string())
221        })?;
222        let buffer = self.storage.read_range(0, len)?;
223        let _ = self.full_file_cache.set(buffer);
224        Ok(self
225            .full_file_cache
226            .get()
227            .expect("full-file buffer must exist after successful initialization")
228            .clone())
229    }
230
231    pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
232        {
233            let cache = self.header_cache.lock();
234            if let Some(hdr) = cache.get(&addr) {
235                return Ok(Arc::clone(hdr));
236            }
237        }
238
239        let mut hdr = ObjectHeader::parse_at_storage(
240            self.storage.as_ref(),
241            addr,
242            self.superblock.offset_size,
243            self.superblock.length_size,
244        )?;
245        hdr.resolve_shared_messages_storage_with_sohm(
246            self.storage.as_ref(),
247            self.superblock.offset_size,
248            self.superblock.length_size,
249            |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
250        )?;
251        let arc = Arc::new(hdr);
252        let mut cache = self.header_cache.lock();
253        cache.insert(addr, Arc::clone(&arc));
254        Ok(arc)
255    }
256
257    fn resolve_sohm_message(
258        &self,
259        heap_id: &[u8],
260        message_type: u16,
261    ) -> Result<Option<HdfMessage>> {
262        let Some(table) = self.sohm_table()? else {
263            return Ok(None);
264        };
265        table.resolve_heap_message(
266            heap_id,
267            message_type,
268            self.storage.as_ref(),
269            self.superblock.offset_size,
270            self.superblock.length_size,
271            Some(self.filter_registry.as_ref()),
272        )
273    }
274
275    fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
276        let cached = self
277            .sohm_table
278            .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
279        match cached {
280            Ok(table) => Ok(table.clone()),
281            Err(message) => Err(Error::InvalidData(format!(
282                "failed to load SOHM table: {message}"
283            ))),
284        }
285    }
286
287    fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
288        let Some(extension_address) = self.superblock.extension_address else {
289            return Ok(None);
290        };
291        let extension = ObjectHeader::parse_at_storage(
292            self.storage.as_ref(),
293            extension_address,
294            self.superblock.offset_size,
295            self.superblock.length_size,
296        )?;
297
298        let shared_table = extension.messages.iter().find_map(|message| match message {
299            HdfMessage::SharedTable(table) => Some(table),
300            _ => None,
301        });
302        let Some(shared_table) = shared_table else {
303            return Ok(None);
304        };
305
306        let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
307            self.storage.as_ref(),
308            shared_table.table_address,
309            shared_table.num_indices,
310            self.superblock.offset_size,
311        )?;
312        Ok(Some(Arc::new(table)))
313    }
314
315    pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
316        if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
317            return Ok(Some(storage));
318        }
319
320        let Some(resolver) = self.external_file_resolver.as_ref() else {
321            return Ok(None);
322        };
323        let Some(storage) = resolver.resolve_external_file(filename)? else {
324            return Ok(None);
325        };
326        self.external_file_cache
327            .lock()
328            .insert(filename.to_string(), storage.clone());
329        Ok(Some(storage))
330    }
331}
332
333impl Hdf5File {
334    fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
335        let superblock = Superblock::parse_from_storage(storage.as_ref())?;
336        let cache = Arc::new(ChunkCache::new(
337            options.chunk_cache_bytes,
338            options.chunk_cache_slots,
339        ));
340        let registry = options.filter_registry.unwrap_or_default();
341        let external_file_resolver = options.external_file_resolver;
342        let external_link_resolver = options.external_link_resolver;
343
344        Ok(Hdf5File {
345            context: Arc::new(FileContext {
346                storage,
347                superblock,
348                chunk_cache: cache,
349                header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
350                dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
351                filter_registry: Arc::new(registry),
352                external_file_resolver,
353                external_link_resolver,
354                external_file_cache: parking_lot::Mutex::new(HashMap::new()),
355                sohm_table: OnceLock::new(),
356                full_file_cache: OnceLock::new(),
357            }),
358        })
359    }
360
361    /// Open an HDF5 file with default options.
362    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
363        Self::open_with_options(path, OpenOptions::default())
364    }
365
366    /// Open an HDF5 file with custom options.
367    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
368        let path = path.as_ref();
369        let mut options = options;
370        if options.external_file_resolver.is_none() {
371            let base_dir = path
372                .parent()
373                .map(Path::to_path_buf)
374                .unwrap_or_else(|| PathBuf::from("."));
375            options.external_file_resolver =
376                Some(Arc::new(FilesystemExternalFileResolver::new(base_dir)));
377        }
378        Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
379    }
380
381    /// Open an HDF5 file from an in-memory byte slice.
382    ///
383    /// The data is copied into an owned buffer.
384    pub fn from_bytes(data: &[u8]) -> Result<Self> {
385        Self::from_bytes_with_options(data, OpenOptions::default())
386    }
387
388    /// Open an HDF5 file from an in-memory byte slice with custom options.
389    ///
390    /// The data is copied into an owned buffer.
391    pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
392        Self::from_vec_with_options(data.to_vec(), options)
393    }
394
395    /// Open an HDF5 file from an owned byte vector without copying.
396    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
397        Self::from_vec_with_options(data, OpenOptions::default())
398    }
399
400    /// Open an HDF5 file from an owned byte vector with custom options.
401    pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
402        Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
403    }
404
405    /// Open an HDF5 file from an existing memory map with custom options.
406    ///
407    /// This avoids remapping when the caller already owns a read-only mapping.
408    pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
409        Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
410    }
411
412    /// Open an HDF5 file from a custom random-access storage backend.
413    pub fn from_storage(storage: DynStorage) -> Result<Self> {
414        Self::from_storage_with_options(storage, OpenOptions::default())
415    }
416
417    /// Open an HDF5 file from a custom random-access storage backend.
418    pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
419        Self::from_storage_impl(storage, options)
420    }
421
422    /// Get the parsed superblock.
423    pub fn superblock(&self) -> &Superblock {
424        &self.context.superblock
425    }
426
427    /// Access the underlying random-access storage backend.
428    pub fn storage(&self) -> &dyn Storage {
429        self.context.storage.as_ref()
430    }
431
432    /// Return current chunk-cache statistics for this file.
433    pub fn chunk_cache_stats(&self) -> ChunkCacheStats {
434        self.context.chunk_cache.stats()
435    }
436
437    /// Look up or parse an object header at the given address.
438    ///
439    /// Uses the internal cache to avoid re-parsing the same header.
440    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
441        self.context.get_or_parse_header(addr)
442    }
443
444    /// Get the root group of the file.
445    pub fn root_group(&self) -> Result<Group> {
446        let addr = self.context.superblock.root_object_header_address()?;
447
448        Ok(Group::new(
449            self.context.clone(),
450            addr,
451            "/".to_string(),
452            addr, // root_address = self
453        ))
454    }
455
456    /// Convenience: get a dataset at a path like "/group1/dataset".
457    pub fn dataset(&self, path: &str) -> Result<Dataset> {
458        let parts: Vec<&str> = path
459            .trim_start_matches('/')
460            .split('/')
461            .filter(|s| !s.is_empty())
462            .collect();
463        let normalized_path = format!("/{}", parts.join("/"));
464
465        if parts.is_empty() {
466            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
467        }
468
469        if let Some(template) = self
470            .context
471            .dataset_path_cache
472            .lock()
473            .get(&normalized_path)
474            .cloned()
475        {
476            return Ok(Dataset::from_template(self.context.clone(), template));
477        }
478
479        let mut group = self.root_group()?;
480        for &part in &parts[..parts.len() - 1] {
481            group = group.group(part).map_err(|e| e.with_context(path))?;
482        }
483
484        let dataset = group
485            .dataset(parts[parts.len() - 1])
486            .map_err(|e| e.with_context(path))?;
487        if Arc::ptr_eq(&dataset.context, &self.context) {
488            self.context
489                .dataset_path_cache
490                .lock()
491                .insert(normalized_path, dataset.template());
492        }
493        Ok(dataset)
494    }
495
496    /// Convenience: get a group at a path like "/group1/subgroup".
497    pub fn group(&self, path: &str) -> Result<Group> {
498        let parts: Vec<&str> = path
499            .trim_start_matches('/')
500            .split('/')
501            .filter(|s| !s.is_empty())
502            .collect();
503
504        let mut group = self.root_group()?;
505        for &part in &parts {
506            group = group.group(part)?;
507        }
508
509        Ok(group)
510    }
511}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// The default cache sizing must match the values documented on
    /// `OpenOptions`.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();

        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Bytes that are not an HDF5 file must be rejected instead of opened.
    #[test]
    fn test_invalid_file() {
        assert!(Hdf5File::from_bytes(b"this is not an HDF5 file").is_err());
    }

    /// A relative file name is resolved against the resolver's base directory
    /// and the returned storage serves range reads from that file.
    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("raw.bin"), b"abcdef").unwrap();

        let storage = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file("raw.bin")
            .unwrap()
            .expect("raw file should resolve");

        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }
}