1pub mod checksum;
2pub mod error;
3pub mod io;
4
5pub mod superblock;
7
8pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20pub mod messages;
22pub mod object_header;
23
24pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32pub mod filters;
34
35pub mod cache;
37
38use std::collections::HashMap;
39use std::path::{Path, PathBuf};
40use std::sync::{Arc, OnceLock};
41
42use memmap2::Mmap;
43use cache::ChunkCache;
46use error::{Error, Result};
47use group::Group;
48use messages::HdfMessage;
49use object_header::ObjectHeader;
50use shared_message_table::SharedMessageTableRef;
51use storage::DynStorage;
52use superblock::Superblock;
53
54pub use attribute_api::Attribute;
56use dataset::DatasetTemplate;
57pub use dataset::{Dataset, SliceInfo, SliceInfoElem};
58pub use datatype_api::{
59 dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
60 StringPadding, StringSize,
61};
62pub use error::ByteOrder;
63pub use filters::FilterRegistry;
64pub use messages::datatype::Datatype;
65pub use storage::{BytesStorage, FileStorage, MmapStorage, Storage, StorageBuffer};
66
67pub struct OpenOptions {
69 pub chunk_cache_bytes: usize,
71 pub chunk_cache_slots: usize,
73 pub filter_registry: Option<FilterRegistry>,
75 pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
77 pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
79}
80
81impl Default for OpenOptions {
82 fn default() -> Self {
83 OpenOptions {
84 chunk_cache_bytes: 64 * 1024 * 1024,
85 chunk_cache_slots: 521,
86 filter_registry: None,
87 external_file_resolver: None,
88 external_link_resolver: None,
89 }
90 }
91}
92
93pub trait ExternalFileResolver: Send + Sync {
95 fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
96}
97
98pub trait ExternalLinkResolver: Send + Sync {
100 fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
101}
102
/// [`ExternalFileResolver`] that looks files up on the local filesystem.
///
/// Relative names are interpreted against `base_dir`; absolute names are used
/// as given.
#[derive(Debug, Clone)]
pub struct FilesystemExternalFileResolver {
    /// Directory that relative file names are joined onto.
    base_dir: PathBuf,
}

impl FilesystemExternalFileResolver {
    /// Creates a resolver that resolves relative names against `base_dir`.
    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
        let base_dir = base_dir.into();
        Self { base_dir }
    }

    /// Turns `filename` into the path that should be opened: joined onto
    /// `base_dir` when relative, returned unchanged when absolute.
    fn path_for(&self, filename: &str) -> PathBuf {
        let requested = Path::new(filename);
        if requested.is_relative() {
            return self.base_dir.join(requested);
        }
        requested.to_path_buf()
    }
}
125
126impl ExternalFileResolver for FilesystemExternalFileResolver {
127 fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
128 let path = self.path_for(filename);
129 if !path.exists() {
130 return Ok(None);
131 }
132 Ok(Some(Arc::new(FileStorage::open(path)?)))
133 }
134}
135
136pub struct FilesystemExternalLinkResolver {
139 base_dir: PathBuf,
140 cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
141}
142
143impl FilesystemExternalLinkResolver {
144 pub fn new(base_dir: impl Into<PathBuf>) -> Self {
145 Self {
146 base_dir: base_dir.into(),
147 cache: parking_lot::Mutex::new(HashMap::new()),
148 }
149 }
150
151 fn path_for(&self, filename: &str) -> PathBuf {
152 let path = Path::new(filename);
153 if path.is_absolute() {
154 path.to_path_buf()
155 } else {
156 self.base_dir.join(path)
157 }
158 }
159}
160
161impl ExternalLinkResolver for FilesystemExternalLinkResolver {
162 fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
163 let path = self.path_for(filename);
164 if !path.exists() {
165 return Ok(None);
166 }
167
168 if let Some(file) = self.cache.lock().get(&path).cloned() {
169 return Ok(Some(file));
170 }
171
172 let file = Hdf5File::open(&path)?;
173 self.cache.lock().insert(path, file.clone());
174 Ok(Some(file))
175 }
176}
177
178pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
180
181#[derive(Clone)]
187pub struct Hdf5File {
188 context: Arc<FileContext>,
189}
190
191pub(crate) struct FileContext {
192 pub(crate) storage: DynStorage,
193 pub(crate) superblock: Superblock,
194 pub(crate) chunk_cache: Arc<ChunkCache>,
195 pub(crate) header_cache: HeaderCache,
196 pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
197 pub(crate) filter_registry: Arc<FilterRegistry>,
198 pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
199 pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
200 pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
201 sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
202 full_file_cache: OnceLock<StorageBuffer>,
203}
204
205impl FileContext {
206 pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
207 self.storage.read_range(offset, len)
208 }
209
210 pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
211 if let Some(buffer) = self.full_file_cache.get() {
212 return Ok(buffer.clone());
213 }
214
215 let len = usize::try_from(self.storage.len()).map_err(|_| {
216 Error::InvalidData("file size exceeds platform usize capacity".to_string())
217 })?;
218 let buffer = self.storage.read_range(0, len)?;
219 let _ = self.full_file_cache.set(buffer);
220 Ok(self
221 .full_file_cache
222 .get()
223 .expect("full-file buffer must exist after successful initialization")
224 .clone())
225 }
226
227 pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
228 {
229 let cache = self.header_cache.lock();
230 if let Some(hdr) = cache.get(&addr) {
231 return Ok(Arc::clone(hdr));
232 }
233 }
234
235 let mut hdr = ObjectHeader::parse_at_storage(
236 self.storage.as_ref(),
237 addr,
238 self.superblock.offset_size,
239 self.superblock.length_size,
240 )?;
241 hdr.resolve_shared_messages_storage_with_sohm(
242 self.storage.as_ref(),
243 self.superblock.offset_size,
244 self.superblock.length_size,
245 |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
246 )?;
247 let arc = Arc::new(hdr);
248 let mut cache = self.header_cache.lock();
249 cache.insert(addr, Arc::clone(&arc));
250 Ok(arc)
251 }
252
253 fn resolve_sohm_message(
254 &self,
255 heap_id: &[u8],
256 message_type: u16,
257 ) -> Result<Option<HdfMessage>> {
258 let Some(table) = self.sohm_table()? else {
259 return Ok(None);
260 };
261 table.resolve_heap_message(
262 heap_id,
263 message_type,
264 self.storage.as_ref(),
265 self.superblock.offset_size,
266 self.superblock.length_size,
267 )
268 }
269
270 fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
271 let cached = self
272 .sohm_table
273 .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
274 match cached {
275 Ok(table) => Ok(table.clone()),
276 Err(message) => Err(Error::InvalidData(format!(
277 "failed to load SOHM table: {message}"
278 ))),
279 }
280 }
281
282 fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
283 let Some(extension_address) = self.superblock.extension_address else {
284 return Ok(None);
285 };
286 let extension = ObjectHeader::parse_at_storage(
287 self.storage.as_ref(),
288 extension_address,
289 self.superblock.offset_size,
290 self.superblock.length_size,
291 )?;
292
293 let shared_table = extension.messages.iter().find_map(|message| match message {
294 HdfMessage::SharedTable(table) => Some(table),
295 _ => None,
296 });
297 let Some(shared_table) = shared_table else {
298 return Ok(None);
299 };
300
301 let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
302 self.storage.as_ref(),
303 shared_table.table_address,
304 shared_table.num_indices,
305 self.superblock.offset_size,
306 )?;
307 Ok(Some(Arc::new(table)))
308 }
309
310 pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
311 if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
312 return Ok(Some(storage));
313 }
314
315 let Some(resolver) = self.external_file_resolver.as_ref() else {
316 return Ok(None);
317 };
318 let Some(storage) = resolver.resolve_external_file(filename)? else {
319 return Ok(None);
320 };
321 self.external_file_cache
322 .lock()
323 .insert(filename.to_string(), storage.clone());
324 Ok(Some(storage))
325 }
326}
327
328impl Hdf5File {
329 fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
330 let superblock = Superblock::parse_from_storage(storage.as_ref())?;
331 let cache = Arc::new(ChunkCache::new(
332 options.chunk_cache_bytes,
333 options.chunk_cache_slots,
334 ));
335 let registry = options.filter_registry.unwrap_or_default();
336 let external_file_resolver = options.external_file_resolver;
337 let external_link_resolver = options.external_link_resolver;
338
339 Ok(Hdf5File {
340 context: Arc::new(FileContext {
341 storage,
342 superblock,
343 chunk_cache: cache,
344 header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
345 dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
346 filter_registry: Arc::new(registry),
347 external_file_resolver,
348 external_link_resolver,
349 external_file_cache: parking_lot::Mutex::new(HashMap::new()),
350 sohm_table: OnceLock::new(),
351 full_file_cache: OnceLock::new(),
352 }),
353 })
354 }
355
356 pub fn open(path: impl AsRef<Path>) -> Result<Self> {
358 Self::open_with_options(path, OpenOptions::default())
359 }
360
361 pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
363 let path = path.as_ref();
364 let mut options = options;
365 if options.external_file_resolver.is_none() {
366 let base_dir = path
367 .parent()
368 .map(Path::to_path_buf)
369 .unwrap_or_else(|| PathBuf::from("."));
370 options.external_file_resolver =
371 Some(Arc::new(FilesystemExternalFileResolver::new(base_dir)));
372 }
373 Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
374 }
375
376 pub fn from_bytes(data: &[u8]) -> Result<Self> {
380 Self::from_bytes_with_options(data, OpenOptions::default())
381 }
382
383 pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
387 Self::from_vec_with_options(data.to_vec(), options)
388 }
389
390 pub fn from_vec(data: Vec<u8>) -> Result<Self> {
392 Self::from_vec_with_options(data, OpenOptions::default())
393 }
394
395 pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
397 Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
398 }
399
400 pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
404 Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
405 }
406
407 pub fn from_storage(storage: DynStorage) -> Result<Self> {
409 Self::from_storage_with_options(storage, OpenOptions::default())
410 }
411
412 pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
414 Self::from_storage_impl(storage, options)
415 }
416
417 pub fn superblock(&self) -> &Superblock {
419 &self.context.superblock
420 }
421
422 pub fn storage(&self) -> &dyn Storage {
424 self.context.storage.as_ref()
425 }
426
427 pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
431 self.context.get_or_parse_header(addr)
432 }
433
434 pub fn root_group(&self) -> Result<Group> {
436 let addr = self.context.superblock.root_object_header_address()?;
437
438 Ok(Group::new(
439 self.context.clone(),
440 addr,
441 "/".to_string(),
442 addr, ))
444 }
445
446 pub fn dataset(&self, path: &str) -> Result<Dataset> {
448 let parts: Vec<&str> = path
449 .trim_start_matches('/')
450 .split('/')
451 .filter(|s| !s.is_empty())
452 .collect();
453 let normalized_path = format!("/{}", parts.join("/"));
454
455 if parts.is_empty() {
456 return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
457 }
458
459 if let Some(template) = self
460 .context
461 .dataset_path_cache
462 .lock()
463 .get(&normalized_path)
464 .cloned()
465 {
466 return Ok(Dataset::from_template(self.context.clone(), template));
467 }
468
469 let mut group = self.root_group()?;
470 for &part in &parts[..parts.len() - 1] {
471 group = group.group(part).map_err(|e| e.with_context(path))?;
472 }
473
474 let dataset = group
475 .dataset(parts[parts.len() - 1])
476 .map_err(|e| e.with_context(path))?;
477 if Arc::ptr_eq(&dataset.context, &self.context) {
478 self.context
479 .dataset_path_cache
480 .lock()
481 .insert(normalized_path, dataset.template());
482 }
483 Ok(dataset)
484 }
485
486 pub fn group(&self, path: &str) -> Result<Group> {
488 let parts: Vec<&str> = path
489 .trim_start_matches('/')
490 .split('/')
491 .filter(|s| !s.is_empty())
492 .collect();
493
494 let mut group = self.root_group()?;
495 for &part in &parts {
496 group = group.group(part)?;
497 }
498
499 Ok(group)
500 }
501}
502
#[cfg(test)]
mod tests {
    use super::*;

    /// The default options use a 64 MiB chunk cache with 521 slots.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();
        assert_eq!(chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Bytes that do not form a valid file must be rejected.
    #[test]
    fn test_invalid_file() {
        let outcome = Hdf5File::from_bytes(b"this is not an HDF5 file");
        assert!(outcome.is_err());
    }

    /// A relative name is resolved against the resolver's base directory and
    /// the resulting storage serves the file's bytes.
    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let scratch = tempfile::tempdir().unwrap();
        let raw_path = scratch.path().join("raw.bin");
        std::fs::write(&raw_path, b"abcdef").unwrap();

        let resolver = FilesystemExternalFileResolver::new(scratch.path());
        let resolved = resolver.resolve_external_file("raw.bin").unwrap();
        let storage = resolved.expect("raw file should resolve");

        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }
}