1pub mod checksum;
2pub mod error;
3pub mod io;
4
5pub mod superblock;
7
8pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20pub mod messages;
22pub mod object_header;
23
24pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32pub mod filters;
34
35pub mod cache;
37
38use std::collections::HashMap;
39use std::path::{Path, PathBuf};
40use std::sync::{Arc, OnceLock};
41
42use memmap2::Mmap;
43use cache::ChunkCache;
46use error::{Error, Result};
47use group::Group;
48use messages::HdfMessage;
49use object_header::ObjectHeader;
50use shared_message_table::SharedMessageTableRef;
51use storage::DynStorage;
52use superblock::Superblock;
53
54pub use attribute_api::Attribute;
56pub use cache::ChunkCacheStats;
57use dataset::DatasetTemplate;
58pub use dataset::{Dataset, DatasetChunk, DatasetChunkIterator, SliceInfo, SliceInfoElem};
59pub use datatype_api::{
60 dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
61 StringPadding, StringSize, VarLenKind,
62};
63pub use error::ByteOrder;
64pub use filters::FilterRegistry;
65pub use messages::datatype::Datatype;
66pub use storage::{
67 BlockCacheStats, BlockCacheStorage, BytesStorage, FileStorage, MmapStorage,
68 RangeRequestStorage, Storage, StorageBuffer,
69};
70
71pub struct OpenOptions {
73 pub chunk_cache_bytes: usize,
75 pub chunk_cache_slots: usize,
77 pub filter_registry: Option<FilterRegistry>,
79 pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
81 pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
83}
84
85impl Default for OpenOptions {
86 fn default() -> Self {
87 OpenOptions {
88 chunk_cache_bytes: 64 * 1024 * 1024,
89 chunk_cache_slots: 521,
90 filter_registry: None,
91 external_file_resolver: None,
92 external_link_resolver: None,
93 }
94 }
95}
96
/// Pluggable lookup that turns an external file name into readable storage.
///
/// `FileContext::resolve_external_file` calls this at most until it gets a
/// hit for a given name: a `Some` result is cached per file name, while
/// `Ok(None)` and errors are passed straight back to the caller.
///
/// Implementations must be `Send + Sync` because the resolver is shared
/// behind an `Arc`.
pub trait ExternalFileResolver: Send + Sync {
    /// Resolves `filename` to a storage handle.
    ///
    /// Returns `Ok(None)` when the name cannot be resolved (the filesystem
    /// implementation in this module does so when the path does not exist)
    /// and `Err` when resolution itself fails.
    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
}
101
/// Pluggable lookup that turns the file name of an external link into an
/// opened [`Hdf5File`].
///
/// The resolver is carried on `OpenOptions` / `FileContext`; the code that
/// invokes it is not in this module.
/// NOTE(review): presumably called while traversing HDF5 external links —
/// confirm against the group/link handling code.
///
/// Implementations must be `Send + Sync` because the resolver is shared
/// behind an `Arc`.
pub trait ExternalLinkResolver: Send + Sync {
    /// Resolves `filename` to an opened file.
    ///
    /// Returns `Ok(None)` when the target cannot be found (the filesystem
    /// implementation in this module does so when the path does not exist)
    /// and `Err` when opening it fails.
    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
}
106
/// [`ExternalFileResolver`] that looks files up on the local filesystem,
/// interpreting relative names against a fixed base directory.
#[derive(Debug, Clone)]
pub struct FilesystemExternalFileResolver {
    /// Directory that relative file names are joined onto.
    base_dir: PathBuf,
}

impl FilesystemExternalFileResolver {
    /// Creates a resolver that resolves relative names against `base_dir`.
    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
        let base_dir = base_dir.into();
        Self { base_dir }
    }

    /// Maps `filename` to the path that should be opened.
    ///
    /// Absolute names are returned unchanged and relative names are placed
    /// under `base_dir`. `Path::join` implements exactly that rule: joining
    /// an absolute path replaces the base entirely.
    fn path_for(&self, filename: &str) -> PathBuf {
        self.base_dir.join(filename)
    }
}
129
130impl ExternalFileResolver for FilesystemExternalFileResolver {
131 fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
132 let path = self.path_for(filename);
133 if !path.exists() {
134 return Ok(None);
135 }
136 Ok(Some(Arc::new(FileStorage::open(path)?)))
137 }
138}
139
140pub struct FilesystemExternalLinkResolver {
143 base_dir: PathBuf,
144 cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
145}
146
147impl FilesystemExternalLinkResolver {
148 pub fn new(base_dir: impl Into<PathBuf>) -> Self {
149 Self {
150 base_dir: base_dir.into(),
151 cache: parking_lot::Mutex::new(HashMap::new()),
152 }
153 }
154
155 fn path_for(&self, filename: &str) -> PathBuf {
156 let path = Path::new(filename);
157 if path.is_absolute() {
158 path.to_path_buf()
159 } else {
160 self.base_dir.join(path)
161 }
162 }
163}
164
165impl ExternalLinkResolver for FilesystemExternalLinkResolver {
166 fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
167 let path = self.path_for(filename);
168 if !path.exists() {
169 return Ok(None);
170 }
171
172 if let Some(file) = self.cache.lock().get(&path).cloned() {
173 return Ok(Some(file));
174 }
175
176 let file = Hdf5File::open(&path)?;
177 self.cache.lock().insert(path, file.clone());
178 Ok(Some(file))
179 }
180}
181
/// Shared, mutex-guarded map of parsed object headers.
///
/// `FileContext::get_or_parse_header` uses the `u64` key as the address the
/// header was parsed at.
pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
184
/// Handle to an opened HDF5 file.
///
/// The handle only holds an `Arc` to the shared `FileContext`, so cloning it
/// produces another handle to the same storage and caches.
#[derive(Clone)]
pub struct Hdf5File {
    /// Shared state (storage, superblock, caches, resolvers) for this file.
    context: Arc<FileContext>,
}
194
/// State shared by every handle and object derived from one opened file.
pub(crate) struct FileContext {
    /// Backing storage all reads go through.
    pub(crate) storage: DynStorage,
    /// Superblock parsed from `storage` when the file was opened.
    pub(crate) superblock: Superblock,
    /// Chunk cache built from `OpenOptions::chunk_cache_bytes` and
    /// `OpenOptions::chunk_cache_slots`.
    pub(crate) chunk_cache: Arc<ChunkCache>,
    /// Parsed object headers keyed by address; see `get_or_parse_header`.
    pub(crate) header_cache: HeaderCache,
    /// Dataset templates keyed by normalized absolute path (`/a/b/c`);
    /// filled by `Hdf5File::dataset`.
    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
    /// Filter registry supplied through `OpenOptions`, or the default one.
    pub(crate) filter_registry: Arc<FilterRegistry>,
    /// Optional resolver used by `resolve_external_file`.
    pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
    /// Optional resolver for external links; not used within this module.
    pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
    /// Storage handles already returned by the external file resolver,
    /// keyed by the requested file name.
    pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
    /// Outcome of the one-time shared-message (SOHM) table load. A failure
    /// is kept as its message string and reported again on later lookups.
    sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
    /// Buffer holding the whole file, set by the first successful
    /// `full_file_data` call.
    full_file_cache: OnceLock<StorageBuffer>,
}
208
209impl FileContext {
210 pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
211 self.storage.read_range(offset, len)
212 }
213
214 pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
215 if let Some(buffer) = self.full_file_cache.get() {
216 return Ok(buffer.clone());
217 }
218
219 let len = usize::try_from(self.storage.len()).map_err(|_| {
220 Error::InvalidData("file size exceeds platform usize capacity".to_string())
221 })?;
222 let buffer = self.storage.read_range(0, len)?;
223 let _ = self.full_file_cache.set(buffer);
224 Ok(self
225 .full_file_cache
226 .get()
227 .expect("full-file buffer must exist after successful initialization")
228 .clone())
229 }
230
231 pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
232 {
233 let cache = self.header_cache.lock();
234 if let Some(hdr) = cache.get(&addr) {
235 return Ok(Arc::clone(hdr));
236 }
237 }
238
239 let mut hdr = ObjectHeader::parse_at_storage(
240 self.storage.as_ref(),
241 addr,
242 self.superblock.offset_size,
243 self.superblock.length_size,
244 )?;
245 hdr.resolve_shared_messages_storage_with_sohm(
246 self.storage.as_ref(),
247 self.superblock.offset_size,
248 self.superblock.length_size,
249 |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
250 )?;
251 let arc = Arc::new(hdr);
252 let mut cache = self.header_cache.lock();
253 cache.insert(addr, Arc::clone(&arc));
254 Ok(arc)
255 }
256
257 fn resolve_sohm_message(
258 &self,
259 heap_id: &[u8],
260 message_type: u16,
261 ) -> Result<Option<HdfMessage>> {
262 let Some(table) = self.sohm_table()? else {
263 return Ok(None);
264 };
265 table.resolve_heap_message(
266 heap_id,
267 message_type,
268 self.storage.as_ref(),
269 self.superblock.offset_size,
270 self.superblock.length_size,
271 Some(self.filter_registry.as_ref()),
272 )
273 }
274
275 fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
276 let cached = self
277 .sohm_table
278 .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
279 match cached {
280 Ok(table) => Ok(table.clone()),
281 Err(message) => Err(Error::InvalidData(format!(
282 "failed to load SOHM table: {message}"
283 ))),
284 }
285 }
286
287 fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
288 let Some(extension_address) = self.superblock.extension_address else {
289 return Ok(None);
290 };
291 let extension = ObjectHeader::parse_at_storage(
292 self.storage.as_ref(),
293 extension_address,
294 self.superblock.offset_size,
295 self.superblock.length_size,
296 )?;
297
298 let shared_table = extension.messages.iter().find_map(|message| match message {
299 HdfMessage::SharedTable(table) => Some(table),
300 _ => None,
301 });
302 let Some(shared_table) = shared_table else {
303 return Ok(None);
304 };
305
306 let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
307 self.storage.as_ref(),
308 shared_table.table_address,
309 shared_table.num_indices,
310 self.superblock.offset_size,
311 )?;
312 Ok(Some(Arc::new(table)))
313 }
314
315 pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
316 if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
317 return Ok(Some(storage));
318 }
319
320 let Some(resolver) = self.external_file_resolver.as_ref() else {
321 return Ok(None);
322 };
323 let Some(storage) = resolver.resolve_external_file(filename)? else {
324 return Ok(None);
325 };
326 self.external_file_cache
327 .lock()
328 .insert(filename.to_string(), storage.clone());
329 Ok(Some(storage))
330 }
331}
332
333impl Hdf5File {
334 fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
335 let superblock = Superblock::parse_from_storage(storage.as_ref())?;
336 let cache = Arc::new(ChunkCache::new(
337 options.chunk_cache_bytes,
338 options.chunk_cache_slots,
339 ));
340 let registry = options.filter_registry.unwrap_or_default();
341 let external_file_resolver = options.external_file_resolver;
342 let external_link_resolver = options.external_link_resolver;
343
344 Ok(Hdf5File {
345 context: Arc::new(FileContext {
346 storage,
347 superblock,
348 chunk_cache: cache,
349 header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
350 dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
351 filter_registry: Arc::new(registry),
352 external_file_resolver,
353 external_link_resolver,
354 external_file_cache: parking_lot::Mutex::new(HashMap::new()),
355 sohm_table: OnceLock::new(),
356 full_file_cache: OnceLock::new(),
357 }),
358 })
359 }
360
361 pub fn open(path: impl AsRef<Path>) -> Result<Self> {
363 Self::open_with_options(path, OpenOptions::default())
364 }
365
366 pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
368 let path = path.as_ref();
369 let mut options = options;
370 if options.external_file_resolver.is_none() {
371 let base_dir = path
372 .parent()
373 .map(Path::to_path_buf)
374 .unwrap_or_else(|| PathBuf::from("."));
375 options.external_file_resolver =
376 Some(Arc::new(FilesystemExternalFileResolver::new(base_dir)));
377 }
378 Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
379 }
380
381 pub fn from_bytes(data: &[u8]) -> Result<Self> {
385 Self::from_bytes_with_options(data, OpenOptions::default())
386 }
387
388 pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
392 Self::from_vec_with_options(data.to_vec(), options)
393 }
394
395 pub fn from_vec(data: Vec<u8>) -> Result<Self> {
397 Self::from_vec_with_options(data, OpenOptions::default())
398 }
399
400 pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
402 Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
403 }
404
405 pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
409 Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
410 }
411
412 pub fn from_storage(storage: DynStorage) -> Result<Self> {
414 Self::from_storage_with_options(storage, OpenOptions::default())
415 }
416
417 pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
419 Self::from_storage_impl(storage, options)
420 }
421
422 pub fn superblock(&self) -> &Superblock {
424 &self.context.superblock
425 }
426
427 pub fn storage(&self) -> &dyn Storage {
429 self.context.storage.as_ref()
430 }
431
432 pub fn chunk_cache_stats(&self) -> ChunkCacheStats {
434 self.context.chunk_cache.stats()
435 }
436
437 pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
441 self.context.get_or_parse_header(addr)
442 }
443
444 pub fn root_group(&self) -> Result<Group> {
446 let addr = self.context.superblock.root_object_header_address()?;
447
448 Ok(Group::new(
449 self.context.clone(),
450 addr,
451 "/".to_string(),
452 addr, ))
454 }
455
456 pub fn dataset(&self, path: &str) -> Result<Dataset> {
458 let parts: Vec<&str> = path
459 .trim_start_matches('/')
460 .split('/')
461 .filter(|s| !s.is_empty())
462 .collect();
463 let normalized_path = format!("/{}", parts.join("/"));
464
465 if parts.is_empty() {
466 return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
467 }
468
469 if let Some(template) = self
470 .context
471 .dataset_path_cache
472 .lock()
473 .get(&normalized_path)
474 .cloned()
475 {
476 return Ok(Dataset::from_template(self.context.clone(), template));
477 }
478
479 let mut group = self.root_group()?;
480 for &part in &parts[..parts.len() - 1] {
481 group = group.group(part).map_err(|e| e.with_context(path))?;
482 }
483
484 let dataset = group
485 .dataset(parts[parts.len() - 1])
486 .map_err(|e| e.with_context(path))?;
487 if Arc::ptr_eq(&dataset.context, &self.context) {
488 self.context
489 .dataset_path_cache
490 .lock()
491 .insert(normalized_path, dataset.template());
492 }
493 Ok(dataset)
494 }
495
496 pub fn group(&self, path: &str) -> Result<Group> {
498 let parts: Vec<&str> = path
499 .trim_start_matches('/')
500 .split('/')
501 .filter(|s| !s.is_empty())
502 .collect();
503
504 let mut group = self.root_group()?;
505 for &part in &parts {
506 group = group.group(part)?;
507 }
508
509 Ok(group)
510 }
511}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// The default options use a 64 MiB chunk cache with 521 slots.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();
        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Bytes that are not an HDF5 file are rejected at open time.
    #[test]
    fn test_invalid_file() {
        let garbage = b"this is not an HDF5 file";
        assert!(Hdf5File::from_bytes(garbage).is_err());
    }

    /// A relative name is resolved against the resolver's base directory and
    /// the returned storage reads the file's bytes.
    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("raw.bin"), b"abcdef").unwrap();

        let storage = FilesystemExternalFileResolver::new(dir.path())
            .resolve_external_file("raw.bin")
            .unwrap()
            .expect("raw file should resolve");

        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }
}