Skip to main content

hdf5_reader/
lib.rs

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20// Level 2 — Data Objects
21pub mod messages;
22pub mod object_header;
23
24// High-level API
25pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32// Filters
33pub mod filters;
34
35// Utilities
36pub mod cache;
37
38use std::collections::HashMap;
39use std::io::ErrorKind;
40use std::path::{Component, Path, PathBuf};
41use std::sync::{Arc, OnceLock};
42
43use memmap2::Mmap;
44// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
45
46use cache::ChunkCache;
47use error::{Error, Result};
48use group::Group;
49use messages::HdfMessage;
50use object_header::ObjectHeader;
51use shared_message_table::SharedMessageTableRef;
52use storage::DynStorage;
53use superblock::Superblock;
54
55// Re-exports
56pub use attribute_api::Attribute;
57pub use cache::ChunkCacheStats;
58use dataset::DatasetTemplate;
59pub use dataset::{Dataset, DatasetChunk, DatasetChunkIterator, SliceInfo, SliceInfoElem};
60pub use datatype_api::{
61    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
62    StringPadding, StringSize, VarLenKind,
63};
64pub use error::ByteOrder;
65pub use filters::FilterRegistry;
66pub use messages::datatype::Datatype;
67pub use storage::{
68    BlockCacheStats, BlockCacheStorage, BytesStorage, FileStorage, MmapStorage,
69    RangeRequestStorage, Storage, StorageBuffer,
70};
71
72/// Configuration options for opening an HDF5 file.
73pub struct OpenOptions {
74    /// Maximum bytes for the chunk cache. Default: 64 MiB.
75    pub chunk_cache_bytes: usize,
76    /// Maximum number of chunk cache slots. Default: 521.
77    pub chunk_cache_slots: usize,
78    /// Custom filter registry. If `None`, the default built-in filters are used.
79    pub filter_registry: Option<FilterRegistry>,
80    /// Resolver for HDF5 external raw data files. If `None`, external raw data
81    /// files are not resolved.
82    pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
83    /// Optional resolver for HDF5 external links.
84    pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
85}
86
87impl Default for OpenOptions {
88    fn default() -> Self {
89        OpenOptions {
90            chunk_cache_bytes: 64 * 1024 * 1024,
91            chunk_cache_slots: 521,
92            filter_registry: None,
93            external_file_resolver: None,
94            external_link_resolver: None,
95        }
96    }
97}
98
99/// Resolves file names from HDF5 External Data Files messages to storage.
100pub trait ExternalFileResolver: Send + Sync {
101    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
102}
103
104/// Resolves HDF5 external links to another opened file.
105pub trait ExternalLinkResolver: Send + Sync {
106    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
107}
108
109fn resolve_path_within_base(
110    base_dir: &Path,
111    filename: &str,
112    description: &str,
113) -> Result<Option<PathBuf>> {
114    let path = Path::new(filename);
115    if path.as_os_str().is_empty() {
116        return Err(Error::InvalidData(format!("{description} path is empty")));
117    }
118
119    if path.is_absolute() {
120        return Err(Error::InvalidData(format!(
121            "{description} path must be relative: {filename}"
122        )));
123    }
124
125    if path.components().any(|component| {
126        matches!(
127            component,
128            Component::Prefix(_) | Component::RootDir | Component::ParentDir
129        )
130    }) {
131        return Err(Error::InvalidData(format!(
132            "{description} path must stay within the resolver base directory: {filename}"
133        )));
134    }
135
136    let base = match base_dir.canonicalize() {
137        Ok(path) => path,
138        Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
139        Err(err) => return Err(err.into()),
140    };
141    let candidate = base.join(path);
142    let resolved = match candidate.canonicalize() {
143        Ok(path) => path,
144        Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
145        Err(err) => return Err(err.into()),
146    };
147
148    if !resolved.starts_with(&base) {
149        return Err(Error::InvalidData(format!(
150            "{description} path escapes the resolver base directory: {filename}"
151        )));
152    }
153
154    Ok(Some(resolved))
155}
156
157/// Filesystem resolver for external raw data files.
158#[derive(Debug, Clone)]
159pub struct FilesystemExternalFileResolver {
160    base_dir: PathBuf,
161}
162
163impl FilesystemExternalFileResolver {
164    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
165        Self {
166            base_dir: base_dir.into(),
167        }
168    }
169
170    fn path_for(&self, filename: &str) -> Result<Option<PathBuf>> {
171        resolve_path_within_base(&self.base_dir, filename, "external raw data file")
172    }
173}
174
175impl ExternalFileResolver for FilesystemExternalFileResolver {
176    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
177        let Some(path) = self.path_for(filename)? else {
178            return Ok(None);
179        };
180        Ok(Some(Arc::new(FileStorage::open(path)?)))
181    }
182}
183
184/// Filesystem resolver for external links. Linked files are cached after the
185/// first successful open.
186pub struct FilesystemExternalLinkResolver {
187    base_dir: PathBuf,
188    cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
189}
190
191impl FilesystemExternalLinkResolver {
192    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
193        Self {
194            base_dir: base_dir.into(),
195            cache: parking_lot::Mutex::new(HashMap::new()),
196        }
197    }
198
199    fn path_for(&self, filename: &str) -> Result<Option<PathBuf>> {
200        resolve_path_within_base(&self.base_dir, filename, "external link")
201    }
202}
203
204impl ExternalLinkResolver for FilesystemExternalLinkResolver {
205    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
206        let Some(path) = self.path_for(filename)? else {
207            return Ok(None);
208        };
209
210        if let Some(file) = self.cache.lock().get(&path).cloned() {
211            return Ok(Some(file));
212        }
213
214        let file = Hdf5File::open(&path)?;
215        self.cache.lock().insert(path, file.clone());
216        Ok(Some(file))
217    }
218}
219
220/// Cache for parsed object headers, keyed by file address.
221pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
222
223/// An opened HDF5 file.
224///
225/// This is the main entry point for reading HDF5 files. Storage is random-
226/// access and range-based, so metadata and data reads do not require an eager
227/// whole-file mapping.
228#[derive(Clone)]
229pub struct Hdf5File {
230    context: Arc<FileContext>,
231}
232
233pub(crate) struct FileContext {
234    pub(crate) storage: DynStorage,
235    pub(crate) superblock: Superblock,
236    pub(crate) chunk_cache: Arc<ChunkCache>,
237    pub(crate) header_cache: HeaderCache,
238    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
239    pub(crate) filter_registry: Arc<FilterRegistry>,
240    pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
241    pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
242    pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
243    sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
244    full_file_cache: OnceLock<StorageBuffer>,
245}
246
247impl FileContext {
248    pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
249        self.storage.read_range(offset, len)
250    }
251
252    pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
253        if let Some(buffer) = self.full_file_cache.get() {
254            return Ok(buffer.clone());
255        }
256
257        let len = usize::try_from(self.storage.len()).map_err(|_| {
258            Error::InvalidData("file size exceeds platform usize capacity".to_string())
259        })?;
260        let buffer = self.storage.read_range(0, len)?;
261        let _ = self.full_file_cache.set(buffer);
262        Ok(self
263            .full_file_cache
264            .get()
265            .expect("full-file buffer must exist after successful initialization")
266            .clone())
267    }
268
269    pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
270        {
271            let cache = self.header_cache.lock();
272            if let Some(hdr) = cache.get(&addr) {
273                return Ok(Arc::clone(hdr));
274            }
275        }
276
277        let mut hdr = ObjectHeader::parse_at_storage(
278            self.storage.as_ref(),
279            addr,
280            self.superblock.offset_size,
281            self.superblock.length_size,
282        )?;
283        hdr.resolve_shared_messages_storage_with_sohm(
284            self.storage.as_ref(),
285            self.superblock.offset_size,
286            self.superblock.length_size,
287            |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
288        )?;
289        let arc = Arc::new(hdr);
290        let mut cache = self.header_cache.lock();
291        cache.insert(addr, Arc::clone(&arc));
292        Ok(arc)
293    }
294
295    fn resolve_sohm_message(
296        &self,
297        heap_id: &[u8],
298        message_type: u16,
299    ) -> Result<Option<HdfMessage>> {
300        let Some(table) = self.sohm_table()? else {
301            return Ok(None);
302        };
303        table.resolve_heap_message(
304            heap_id,
305            message_type,
306            self.storage.as_ref(),
307            self.superblock.offset_size,
308            self.superblock.length_size,
309            Some(self.filter_registry.as_ref()),
310        )
311    }
312
313    fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
314        let cached = self
315            .sohm_table
316            .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
317        match cached {
318            Ok(table) => Ok(table.clone()),
319            Err(message) => Err(Error::InvalidData(format!(
320                "failed to load SOHM table: {message}"
321            ))),
322        }
323    }
324
325    fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
326        let Some(extension_address) = self.superblock.extension_address else {
327            return Ok(None);
328        };
329        let extension = ObjectHeader::parse_at_storage(
330            self.storage.as_ref(),
331            extension_address,
332            self.superblock.offset_size,
333            self.superblock.length_size,
334        )?;
335
336        let shared_table = extension.messages.iter().find_map(|message| match message {
337            HdfMessage::SharedTable(table) => Some(table),
338            _ => None,
339        });
340        let Some(shared_table) = shared_table else {
341            return Ok(None);
342        };
343
344        let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
345            self.storage.as_ref(),
346            shared_table.table_address,
347            shared_table.num_indices,
348            self.superblock.offset_size,
349        )?;
350        Ok(Some(Arc::new(table)))
351    }
352
353    pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
354        if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
355            return Ok(Some(storage));
356        }
357
358        let Some(resolver) = self.external_file_resolver.as_ref() else {
359            return Ok(None);
360        };
361        let Some(storage) = resolver.resolve_external_file(filename)? else {
362            return Ok(None);
363        };
364        self.external_file_cache
365            .lock()
366            .insert(filename.to_string(), storage.clone());
367        Ok(Some(storage))
368    }
369}
370
371impl Hdf5File {
372    fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
373        let superblock = Superblock::parse_from_storage(storage.as_ref())?;
374        let cache = Arc::new(ChunkCache::new(
375            options.chunk_cache_bytes,
376            options.chunk_cache_slots,
377        ));
378        let registry = options.filter_registry.unwrap_or_default();
379        let external_file_resolver = options.external_file_resolver;
380        let external_link_resolver = options.external_link_resolver;
381
382        Ok(Hdf5File {
383            context: Arc::new(FileContext {
384                storage,
385                superblock,
386                chunk_cache: cache,
387                header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
388                dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
389                filter_registry: Arc::new(registry),
390                external_file_resolver,
391                external_link_resolver,
392                external_file_cache: parking_lot::Mutex::new(HashMap::new()),
393                sohm_table: OnceLock::new(),
394                full_file_cache: OnceLock::new(),
395            }),
396        })
397    }
398
399    /// Open an HDF5 file with default options.
400    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
401        Self::open_with_options(path, OpenOptions::default())
402    }
403
404    /// Open an HDF5 file with custom options.
405    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
406        let path = path.as_ref();
407        Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
408    }
409
410    /// Open an HDF5 file from an in-memory byte slice.
411    ///
412    /// The data is copied into an owned buffer.
413    pub fn from_bytes(data: &[u8]) -> Result<Self> {
414        Self::from_bytes_with_options(data, OpenOptions::default())
415    }
416
417    /// Open an HDF5 file from an in-memory byte slice with custom options.
418    ///
419    /// The data is copied into an owned buffer.
420    pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
421        Self::from_vec_with_options(data.to_vec(), options)
422    }
423
424    /// Open an HDF5 file from an owned byte vector without copying.
425    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
426        Self::from_vec_with_options(data, OpenOptions::default())
427    }
428
429    /// Open an HDF5 file from an owned byte vector with custom options.
430    pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
431        Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
432    }
433
434    /// Open an HDF5 file from an existing memory map with custom options.
435    ///
436    /// This avoids remapping when the caller already owns a read-only mapping.
437    pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
438        Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
439    }
440
441    /// Open an HDF5 file from a custom random-access storage backend.
442    pub fn from_storage(storage: DynStorage) -> Result<Self> {
443        Self::from_storage_with_options(storage, OpenOptions::default())
444    }
445
446    /// Open an HDF5 file from a custom random-access storage backend.
447    pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
448        Self::from_storage_impl(storage, options)
449    }
450
451    /// Get the parsed superblock.
452    pub fn superblock(&self) -> &Superblock {
453        &self.context.superblock
454    }
455
456    /// Access the underlying random-access storage backend.
457    pub fn storage(&self) -> &dyn Storage {
458        self.context.storage.as_ref()
459    }
460
461    /// Return current chunk-cache statistics for this file.
462    pub fn chunk_cache_stats(&self) -> ChunkCacheStats {
463        self.context.chunk_cache.stats()
464    }
465
466    /// Look up or parse an object header at the given address.
467    ///
468    /// Uses the internal cache to avoid re-parsing the same header.
469    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
470        self.context.get_or_parse_header(addr)
471    }
472
473    /// Get the root group of the file.
474    pub fn root_group(&self) -> Result<Group> {
475        let addr = self.context.superblock.root_object_header_address()?;
476
477        Ok(Group::new(
478            self.context.clone(),
479            addr,
480            "/".to_string(),
481            addr, // root_address = self
482        ))
483    }
484
485    /// Convenience: get a dataset at a path like "/group1/dataset".
486    pub fn dataset(&self, path: &str) -> Result<Dataset> {
487        let parts: Vec<&str> = path
488            .trim_start_matches('/')
489            .split('/')
490            .filter(|s| !s.is_empty())
491            .collect();
492        let normalized_path = format!("/{}", parts.join("/"));
493
494        if parts.is_empty() {
495            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
496        }
497
498        if let Some(template) = self
499            .context
500            .dataset_path_cache
501            .lock()
502            .get(&normalized_path)
503            .cloned()
504        {
505            return Ok(Dataset::from_template(self.context.clone(), template));
506        }
507
508        let mut group = self.root_group()?;
509        for &part in &parts[..parts.len() - 1] {
510            group = group.group(part).map_err(|e| e.with_context(path))?;
511        }
512
513        let dataset = group
514            .dataset(parts[parts.len() - 1])
515            .map_err(|e| e.with_context(path))?;
516        if Arc::ptr_eq(&dataset.context, &self.context) {
517            self.context
518                .dataset_path_cache
519                .lock()
520                .insert(normalized_path, dataset.template());
521        }
522        Ok(dataset)
523    }
524
525    /// Convenience: get a group at a path like "/group1/subgroup".
526    pub fn group(&self, path: &str) -> Result<Group> {
527        let parts: Vec<&str> = path
528            .trim_start_matches('/')
529            .split('/')
530            .filter(|s| !s.is_empty())
531            .collect();
532
533        let mut group = self.root_group()?;
534        for &part in &parts {
535            group = group.group(part)?;
536        }
537
538        Ok(group)
539    }
540}
541
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `contents` to `name` inside `dir` and returns the full path.
    fn write_fixture(dir: &std::path::Path, name: &str, contents: &[u8]) -> std::path::PathBuf {
        let path = dir.join(name);
        std::fs::write(&path, contents).unwrap();
        path
    }

    #[test]
    fn open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            external_file_resolver,
            ..
        } = OpenOptions::default();

        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
        assert!(external_file_resolver.is_none());
    }

    #[test]
    fn invalid_file() {
        assert!(Hdf5File::from_bytes(b"this is not an HDF5 file").is_err());
    }

    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let dir = tempfile::tempdir().unwrap();
        write_fixture(dir.path(), "raw.bin", b"abcdef");

        let storage = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file("raw.bin")
            .unwrap()
            .expect("raw file should resolve");

        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }

    #[test]
    fn filesystem_external_file_resolver_rejects_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let absolute = write_fixture(dir.path(), "raw.bin", b"abcdef");

        let err = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file(absolute.to_str().unwrap())
            .err()
            .expect("absolute external file path should be rejected");
        assert!(err.to_string().contains("must be relative"));
    }

    #[test]
    fn filesystem_external_file_resolver_rejects_parent_component() {
        let dir = tempfile::tempdir().unwrap();

        let err = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file("../raw.bin")
            .err()
            .expect("parent external file path should be rejected");
        assert!(err.to_string().contains("resolver base directory"));
    }

    #[cfg(unix)]
    #[test]
    fn filesystem_external_file_resolver_rejects_symlink_escape() {
        use std::os::unix::fs::symlink;

        // The link lives inside the base directory but points outside it.
        let dir = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        let target = write_fixture(outside.path(), "raw.bin", b"abcdef");
        symlink(&target, dir.path().join("raw.bin")).unwrap();

        let err = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file("raw.bin")
            .err()
            .expect("symlink escape should be rejected");
        assert!(err.to_string().contains("escapes"));
    }

    #[test]
    fn filesystem_external_link_resolver_rejects_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let absolute = write_fixture(dir.path(), "linked.h5", b"not really hdf5");

        let err = FilesystemExternalLinkResolver::new(dir.path())
            .resolve_external_link(absolute.to_str().unwrap())
            .err()
            .expect("absolute external link path should be rejected");
        assert!(err.to_string().contains("must be relative"));
    }

    #[test]
    fn filesystem_external_link_resolver_rejects_parent_component() {
        let dir = tempfile::tempdir().unwrap();

        let err = FilesystemExternalLinkResolver::new(dir.path())
            .resolve_external_link("../linked.h5")
            .err()
            .expect("parent external link path should be rejected");
        assert!(err.to_string().contains("resolver base directory"));
    }

    #[cfg(unix)]
    #[test]
    fn filesystem_external_link_resolver_rejects_symlink_escape() {
        use std::os::unix::fs::symlink;

        // The link lives inside the base directory but points outside it.
        let dir = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        let target = write_fixture(outside.path(), "linked.h5", b"not really hdf5");
        symlink(&target, dir.path().join("linked.h5")).unwrap();

        let err = FilesystemExternalLinkResolver::new(dir.path())
            .resolve_external_link("linked.h5")
            .err()
            .expect("symlink escape should be rejected");
        assert!(err.to_string().contains("escapes"));
    }
}
665}