//! `hdf5_reader` — a pure-Rust, read-only HDF5 file parser.
//!
//! Crate root: module layout mirrors the HDF5 specification levels
//! (Level 0 file metadata, Level 1 infrastructure, Level 2 data objects)
//! plus a high-level API (`Hdf5File`, `Group`, `Dataset`).

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod symbol_table;
18
19// Level 2 — Data Objects
20pub mod messages;
21pub mod object_header;
22
23// High-level API
24pub mod attribute_api;
25pub mod dataset;
26pub mod datatype_api;
27pub mod group;
28pub mod reference;
29
30// Filters
31pub mod filters;
32
33// Utilities
34pub mod cache;
35
36use std::collections::HashMap;
37use std::path::Path;
38use std::sync::Arc;
39
40use memmap2::Mmap;
41// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
42
43use cache::ChunkCache;
44use error::{Error, Result};
45use group::Group;
46use io::Cursor;
47use object_header::ObjectHeader;
48use superblock::Superblock;
49
50// Re-exports
51pub use attribute_api::Attribute;
52use dataset::DatasetTemplate;
53pub use dataset::{Dataset, SliceInfo, SliceInfoElem};
54pub use datatype_api::{
55    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
56    StringPadding, StringSize,
57};
58pub use error::ByteOrder;
59pub use filters::FilterRegistry;
60pub use messages::datatype::Datatype;
61
62/// Configuration options for opening an HDF5 file.
63pub struct OpenOptions {
64    /// Maximum bytes for the chunk cache. Default: 64 MiB.
65    pub chunk_cache_bytes: usize,
66    /// Maximum number of chunk cache slots. Default: 521.
67    pub chunk_cache_slots: usize,
68    /// Custom filter registry. If `None`, the default built-in filters are used.
69    pub filter_registry: Option<FilterRegistry>,
70}
71
72impl Default for OpenOptions {
73    fn default() -> Self {
74        OpenOptions {
75            chunk_cache_bytes: 64 * 1024 * 1024,
76            chunk_cache_slots: 521,
77            filter_registry: None,
78        }
79    }
80}
81
82/// Cache for parsed object headers, keyed by file address.
83pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
84
85/// An opened HDF5 file.
86///
87/// This is the main entry point for reading HDF5 files. The file data is
88/// memory-mapped for efficient access.
89pub struct Hdf5File {
90    /// Memory-mapped file data (or owned bytes for `from_bytes`).
91    data: FileData,
92    /// Parsed superblock.
93    superblock: Superblock,
94    /// Shared chunk cache.
95    chunk_cache: Arc<ChunkCache>,
96    /// Object header cache — avoids re-parsing the same header.
97    header_cache: HeaderCache,
98    /// Dataset path cache — avoids repeated path traversal and metadata rebuilds.
99    dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
100    /// Filter registry for decompression — users can register custom filters.
101    filter_registry: Arc<FilterRegistry>,
102}
103
104enum FileData {
105    Mmap(Mmap),
106    Bytes(Vec<u8>),
107}
108
109impl FileData {
110    fn as_slice(&self) -> &[u8] {
111        match self {
112            FileData::Mmap(m) => m,
113            FileData::Bytes(b) => b,
114        }
115    }
116}
117
118impl Hdf5File {
119    /// Open an HDF5 file with default options.
120    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
121        Self::open_with_options(path, OpenOptions::default())
122    }
123
124    /// Open an HDF5 file with custom options.
125    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
126        let file = std::fs::File::open(path.as_ref())?;
127        // SAFETY: We only read from the mapping, and the file isn't modified
128        // while we hold the mapping. The caller is responsible for not
129        // modifying the file concurrently.
130        let mmap = unsafe { Mmap::map(&file)? };
131
132        let mut cursor = Cursor::new(&mmap);
133        let superblock = Superblock::parse(&mut cursor)?;
134
135        let cache = Arc::new(ChunkCache::new(
136            options.chunk_cache_bytes,
137            options.chunk_cache_slots,
138        ));
139
140        let registry = options.filter_registry.unwrap_or_default();
141
142        Ok(Hdf5File {
143            data: FileData::Mmap(mmap),
144            superblock,
145            chunk_cache: cache,
146            header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
147            dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
148            filter_registry: Arc::new(registry),
149        })
150    }
151
152    /// Open an HDF5 file from an in-memory byte slice.
153    ///
154    /// The data is copied into an owned buffer.
155    pub fn from_bytes(data: &[u8]) -> Result<Self> {
156        Self::from_vec(data.to_vec())
157    }
158
159    /// Open an HDF5 file from an owned byte vector without copying.
160    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
161        let mut cursor = Cursor::new(&data);
162        let superblock = Superblock::parse(&mut cursor)?;
163
164        Ok(Hdf5File {
165            data: FileData::Bytes(data),
166            superblock,
167            chunk_cache: Arc::new(ChunkCache::default()),
168            header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
169            dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
170            filter_registry: Arc::new(FilterRegistry::default()),
171        })
172    }
173
174    /// Get the parsed superblock.
175    pub fn superblock(&self) -> &Superblock {
176        &self.superblock
177    }
178
179    /// Look up or parse an object header at the given address.
180    ///
181    /// Uses the internal cache to avoid re-parsing the same header.
182    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
183        {
184            let cache = self.header_cache.lock();
185            if let Some(hdr) = cache.get(&addr) {
186                return Ok(Arc::clone(hdr));
187            }
188        }
189        let data = self.data.as_slice();
190        let mut hdr = ObjectHeader::parse_at(
191            data,
192            addr,
193            self.superblock.offset_size,
194            self.superblock.length_size,
195        )?;
196        hdr.resolve_shared_messages(
197            data,
198            self.superblock.offset_size,
199            self.superblock.length_size,
200        )?;
201        let arc = Arc::new(hdr);
202        let mut cache = self.header_cache.lock();
203        cache.insert(addr, Arc::clone(&arc));
204        Ok(arc)
205    }
206
207    /// Get the root group of the file.
208    pub fn root_group(&self) -> Result<Group<'_>> {
209        let data = self.data.as_slice();
210        let addr = self.superblock.root_object_header_address()?;
211
212        Ok(Group::new(
213            data,
214            addr,
215            "/".to_string(),
216            self.superblock.offset_size,
217            self.superblock.length_size,
218            addr, // root_address = self
219            self.chunk_cache.clone(),
220            self.header_cache.clone(),
221            self.filter_registry.clone(),
222        ))
223    }
224
225    /// Convenience: get a dataset at a path like "/group1/dataset".
226    pub fn dataset(&self, path: &str) -> Result<Dataset<'_>> {
227        let parts: Vec<&str> = path
228            .trim_start_matches('/')
229            .split('/')
230            .filter(|s| !s.is_empty())
231            .collect();
232        let normalized_path = format!("/{}", parts.join("/"));
233
234        if parts.is_empty() {
235            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
236        }
237
238        if let Some(template) = self
239            .dataset_path_cache
240            .lock()
241            .get(&normalized_path)
242            .cloned()
243        {
244            return Ok(Dataset::from_template(
245                self.data.as_slice(),
246                self.superblock.offset_size,
247                self.superblock.length_size,
248                template,
249                self.chunk_cache.clone(),
250                self.filter_registry.clone(),
251            ));
252        }
253
254        let mut group = self.root_group()?;
255        for &part in &parts[..parts.len() - 1] {
256            group = group.group(part).map_err(|e| e.with_context(path))?;
257        }
258
259        let dataset = group
260            .dataset(parts[parts.len() - 1])
261            .map_err(|e| e.with_context(path))?;
262        self.dataset_path_cache
263            .lock()
264            .insert(normalized_path, dataset.template());
265        Ok(dataset)
266    }
267
268    /// Convenience: get a group at a path like "/group1/subgroup".
269    pub fn group(&self, path: &str) -> Result<Group<'_>> {
270        let parts: Vec<&str> = path
271            .trim_start_matches('/')
272            .split('/')
273            .filter(|s| !s.is_empty())
274            .collect();
275
276        let mut group = self.root_group()?;
277        for &part in &parts {
278            group = group.group(part)?;
279        }
280
281        Ok(group)
282    }
283}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaults must match the documented values on `OpenOptions`.
    #[test]
    fn test_open_options_default() {
        let opts = OpenOptions::default();
        assert_eq!(opts.chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(opts.chunk_cache_slots, 521);
    }

    /// Non-HDF5 bytes must be rejected by superblock parsing, not panic.
    #[test]
    fn test_invalid_file() {
        let data = b"this is not an HDF5 file";
        let result = Hdf5File::from_bytes(data);
        assert!(result.is_err());
    }
}