//! `hdf5_reader` — crate root (`lib.rs`).
//!
//! Pure-Rust reader for the HDF5 file format: parses the superblock and
//! object headers, walks groups, and decodes datasets through a pluggable
//! filter (decompression) pipeline.

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod symbol_table;
18
19// Level 2 — Data Objects
20pub mod messages;
21pub mod object_header;
22
23// High-level API
24pub mod attribute_api;
25pub mod dataset;
26pub mod datatype_api;
27pub mod group;
28pub mod reference;
29
30// Filters
31pub mod filters;
32
33// Utilities
34pub mod cache;
35
36use std::collections::HashMap;
37use std::path::Path;
38use std::sync::Arc;
39
40use memmap2::Mmap;
41// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
42
43use cache::ChunkCache;
44use error::{Error, Result};
45use group::Group;
46use io::Cursor;
47use object_header::ObjectHeader;
48use superblock::Superblock;
49
50// Re-exports
51pub use attribute_api::Attribute;
52use dataset::DatasetTemplate;
53pub use dataset::{Dataset, SliceInfo, SliceInfoElem};
54pub use datatype_api::{
55    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
56    StringPadding, StringSize,
57};
58pub use error::ByteOrder;
59pub use filters::FilterRegistry;
60pub use messages::datatype::Datatype;
61
/// Configuration options for opening an HDF5 file.
///
/// Construct via [`Default`] and override individual fields with struct
/// update syntax:
/// `OpenOptions { chunk_cache_bytes: 128 * 1024 * 1024, ..Default::default() }`.
pub struct OpenOptions {
    /// Maximum bytes for the chunk cache. Default: 64 MiB.
    pub chunk_cache_bytes: usize,
    /// Maximum number of chunk cache slots. Default: 521.
    /// (521 is prime, presumably to spread slot hashing; looks like it mirrors
    /// the HDF5 C library's default `nslots` — TODO confirm.)
    pub chunk_cache_slots: usize,
    /// Custom filter registry. If `None`, the default built-in filters are used.
    pub filter_registry: Option<FilterRegistry>,
}
71
72impl Default for OpenOptions {
73    fn default() -> Self {
74        OpenOptions {
75            chunk_cache_bytes: 64 * 1024 * 1024,
76            chunk_cache_slots: 521,
77            filter_registry: None,
78        }
79    }
80}
81
/// Cache for parsed object headers, keyed by file address.
///
/// Shared (`Arc`) and internally locked so one map can be handed to every
/// `Group` derived from an `Hdf5File` (see `root_group`).
pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
84
/// An opened HDF5 file.
///
/// This is the main entry point for reading HDF5 files. The file data is
/// memory-mapped for efficient access (or held as an owned buffer when
/// opened via `from_bytes`/`from_vec`).
pub struct Hdf5File {
    /// Memory-mapped file data (or owned bytes for `from_bytes`).
    data: FileData,
    /// Parsed superblock (offset/length sizes, root group address).
    superblock: Superblock,
    /// Shared chunk cache, sized per `OpenOptions`.
    chunk_cache: Arc<ChunkCache>,
    /// Object header cache — avoids re-parsing the same header.
    header_cache: HeaderCache,
    /// Dataset path cache — avoids repeated path traversal and metadata rebuilds.
    dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
    /// Filter registry for decompression — users can register custom filters.
    filter_registry: Arc<FilterRegistry>,
}
103
/// Backing storage for the raw file bytes: either a read-only memory map
/// or an owned in-memory buffer (used by `from_bytes`/`from_vec`).
enum FileData {
    /// Memory-mapped file contents.
    Mmap(Mmap),
    /// Owned byte buffer.
    Bytes(Vec<u8>),
}
108
109impl FileData {
110    fn as_slice(&self) -> &[u8] {
111        match self {
112            FileData::Mmap(m) => m,
113            FileData::Bytes(b) => b,
114        }
115    }
116}
117
impl Hdf5File {
    /// Shared constructor: parse the superblock and set up the caches.
    ///
    /// Every public `open*`/`from_*` entry point funnels through here.
    fn from_file_data(data: FileData, options: OpenOptions) -> Result<Self> {
        let mut cursor = Cursor::new(data.as_slice());
        let superblock = Superblock::parse(&mut cursor)?;
        let cache = Arc::new(ChunkCache::new(
            options.chunk_cache_bytes,
            options.chunk_cache_slots,
        ));
        // `None` means "use the built-in filter set" (FilterRegistry::default()).
        let registry = options.filter_registry.unwrap_or_default();

        Ok(Hdf5File {
            data,
            superblock,
            chunk_cache: cache,
            header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
            dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
            filter_registry: Arc::new(registry),
        })
    }

    /// Open an HDF5 file with default options.
    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
        Self::open_with_options(path, OpenOptions::default())
    }

    /// Open an HDF5 file with custom options.
    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
        let file = std::fs::File::open(path.as_ref())?;
        // SAFETY: We only read from the mapping, and the file isn't modified
        // while we hold the mapping. The caller is responsible for not
        // modifying the file concurrently.
        let mmap = unsafe { Mmap::map(&file)? };
        Self::from_mmap_with_options(mmap, options)
    }

    /// Open an HDF5 file from an in-memory byte slice.
    ///
    /// The data is copied into an owned buffer.
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        Self::from_bytes_with_options(data, OpenOptions::default())
    }

    /// Open an HDF5 file from an in-memory byte slice with custom options.
    ///
    /// The data is copied into an owned buffer.
    pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
        Self::from_vec_with_options(data.to_vec(), options)
    }

    /// Open an HDF5 file from an owned byte vector without copying.
    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
        Self::from_vec_with_options(data, OpenOptions::default())
    }

    /// Open an HDF5 file from an owned byte vector with custom options.
    pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
        Self::from_file_data(FileData::Bytes(data), options)
    }

    /// Open an HDF5 file from an existing memory map with custom options.
    ///
    /// This avoids remapping when the caller already owns a read-only mapping.
    pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
        Self::from_file_data(FileData::Mmap(mmap), options)
    }

    /// Get the parsed superblock.
    pub fn superblock(&self) -> &Superblock {
        &self.superblock
    }

    /// Look up or parse an object header at the given address.
    ///
    /// Uses the internal cache to avoid re-parsing the same header.
    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
        // Fast path: header already cached. The guard is confined to this
        // scope so the mutex is NOT held while parsing below.
        {
            let cache = self.header_cache.lock();
            if let Some(hdr) = cache.get(&addr) {
                return Ok(Arc::clone(hdr));
            }
        }
        // Slow path: parse outside the lock. Two threads can race here and
        // both parse the same address; the later insert overwrites the
        // earlier entry with an equivalent header, so the race is benign
        // (only duplicated work, not incorrect data).
        let data = self.data.as_slice();
        let mut hdr = ObjectHeader::parse_at(
            data,
            addr,
            self.superblock.offset_size,
            self.superblock.length_size,
        )?;
        // Inline any messages stored in a shared-message heap so the cached
        // header is self-contained.
        hdr.resolve_shared_messages(
            data,
            self.superblock.offset_size,
            self.superblock.length_size,
        )?;
        let arc = Arc::new(hdr);
        let mut cache = self.header_cache.lock();
        cache.insert(addr, Arc::clone(&arc));
        Ok(arc)
    }

    /// Get the root group of the file.
    pub fn root_group(&self) -> Result<Group<'_>> {
        let data = self.data.as_slice();
        let addr = self.superblock.root_object_header_address()?;

        // The root group shares this file's caches and filter registry so
        // all groups/datasets derived from it reuse the same state.
        Ok(Group::new(
            data,
            addr,
            "/".to_string(),
            self.superblock.offset_size,
            self.superblock.length_size,
            addr, // root_address = self
            self.chunk_cache.clone(),
            self.header_cache.clone(),
            self.filter_registry.clone(),
        ))
    }

    /// Convenience: get a dataset at a path like "/group1/dataset".
    pub fn dataset(&self, path: &str) -> Result<Dataset<'_>> {
        // Normalize: strip leading '/', drop empty segments ("//", trailing '/')
        // so "/a//b/" and "a/b" hit the same cache key.
        let parts: Vec<&str> = path
            .trim_start_matches('/')
            .split('/')
            .filter(|s| !s.is_empty())
            .collect();
        let normalized_path = format!("/{}", parts.join("/"));

        // An empty path (e.g. "" or "/") cannot name a dataset.
        if parts.is_empty() {
            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
        }

        // Fast path: rebuild the dataset from a cached template, skipping
        // group traversal and metadata parsing. The guard is dropped at the
        // end of the `if let` condition expression.
        if let Some(template) = self
            .dataset_path_cache
            .lock()
            .get(&normalized_path)
            .cloned()
        {
            return Ok(Dataset::from_template(
                self.data.as_slice(),
                self.superblock.offset_size,
                self.superblock.length_size,
                template,
                self.chunk_cache.clone(),
                self.filter_registry.clone(),
            ));
        }

        // Slow path: walk each intermediate group from the root, attaching
        // the full requested path to any error for better diagnostics.
        let mut group = self.root_group()?;
        for &part in &parts[..parts.len() - 1] {
            group = group.group(part).map_err(|e| e.with_context(path))?;
        }

        // Final segment is the dataset itself; cache its template for
        // subsequent lookups of the same path.
        let dataset = group
            .dataset(parts[parts.len() - 1])
            .map_err(|e| e.with_context(path))?;
        self.dataset_path_cache
            .lock()
            .insert(normalized_path, dataset.template());
        Ok(dataset)
    }

    /// Convenience: get a group at a path like "/group1/subgroup".
    ///
    /// An empty path (or "/") returns the root group.
    pub fn group(&self, path: &str) -> Result<Group<'_>> {
        let parts: Vec<&str> = path
            .trim_start_matches('/')
            .split('/')
            .filter(|s| !s.is_empty())
            .collect();

        // Descend one segment at a time; zero segments leaves us at the root.
        let mut group = self.root_group()?;
        for &part in &parts {
            group = group.group(part)?;
        }

        Ok(group)
    }
}
294
#[cfg(test)]
mod tests {
    use super::*;

    /// `OpenOptions::default()` advertises a 64 MiB chunk cache with 521 slots.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();
        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Garbage bytes must be rejected instead of parsing as a superblock.
    #[test]
    fn test_invalid_file() {
        let bogus: &[u8] = b"this is not an HDF5 file";
        assert!(Hdf5File::from_bytes(bogus).is_err());
    }
}