1pub mod checksum;
2pub mod error;
3pub mod io;
4
5pub mod superblock;
7
8pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20pub mod messages;
22pub mod object_header;
23
24pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32pub mod filters;
34
35pub mod cache;
37
38use std::collections::HashMap;
39use std::io::ErrorKind;
40use std::path::{Component, Path, PathBuf};
41use std::sync::{Arc, OnceLock};
42
43use memmap2::Mmap;
44use cache::ChunkCache;
47use error::{Error, Result};
48use group::Group;
49use messages::HdfMessage;
50use object_header::ObjectHeader;
51use shared_message_table::SharedMessageTableRef;
52use storage::DynStorage;
53use superblock::Superblock;
54
55pub use attribute_api::Attribute;
57pub use cache::ChunkCacheStats;
58use dataset::DatasetTemplate;
59pub use dataset::{Dataset, DatasetChunk, DatasetChunkIterator, SliceInfo, SliceInfoElem};
60pub use datatype_api::{
61 dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
62 StringPadding, StringSize, VarLenKind,
63};
64pub use error::ByteOrder;
65pub use filters::FilterRegistry;
66pub use messages::datatype::Datatype;
67pub use storage::{
68 BlockCacheStats, BlockCacheStorage, BytesStorage, FileStorage, MmapStorage,
69 RangeRequestStorage, Storage, StorageBuffer,
70};
71
72pub struct OpenOptions {
74 pub chunk_cache_bytes: usize,
76 pub chunk_cache_slots: usize,
78 pub filter_registry: Option<FilterRegistry>,
80 pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
83 pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
85}
86
87impl Default for OpenOptions {
88 fn default() -> Self {
89 OpenOptions {
90 chunk_cache_bytes: 64 * 1024 * 1024,
91 chunk_cache_slots: 521,
92 filter_registry: None,
93 external_file_resolver: None,
94 external_link_resolver: None,
95 }
96 }
97}
98
99pub trait ExternalFileResolver: Send + Sync {
101 fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
102}
103
104pub trait ExternalLinkResolver: Send + Sync {
106 fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
107}
108
109fn resolve_path_within_base(
110 base_dir: &Path,
111 filename: &str,
112 description: &str,
113) -> Result<Option<PathBuf>> {
114 let path = Path::new(filename);
115 if path.as_os_str().is_empty() {
116 return Err(Error::InvalidData(format!("{description} path is empty")));
117 }
118
119 if path.is_absolute() {
120 return Err(Error::InvalidData(format!(
121 "{description} path must be relative: {filename}"
122 )));
123 }
124
125 if path.components().any(|component| {
126 matches!(
127 component,
128 Component::Prefix(_) | Component::RootDir | Component::ParentDir
129 )
130 }) {
131 return Err(Error::InvalidData(format!(
132 "{description} path must stay within the resolver base directory: {filename}"
133 )));
134 }
135
136 let base = match base_dir.canonicalize() {
137 Ok(path) => path,
138 Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
139 Err(err) => return Err(err.into()),
140 };
141 let candidate = base.join(path);
142 let resolved = match candidate.canonicalize() {
143 Ok(path) => path,
144 Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
145 Err(err) => return Err(err.into()),
146 };
147
148 if !resolved.starts_with(&base) {
149 return Err(Error::InvalidData(format!(
150 "{description} path escapes the resolver base directory: {filename}"
151 )));
152 }
153
154 Ok(Some(resolved))
155}
156
/// [`ExternalFileResolver`] that serves external raw data files from a single
/// directory on the local filesystem.
#[derive(Debug, Clone)]
pub struct FilesystemExternalFileResolver {
    /// Directory every requested filename is resolved against.
    base_dir: PathBuf,
}
162
163impl FilesystemExternalFileResolver {
164 pub fn new(base_dir: impl Into<PathBuf>) -> Self {
165 Self {
166 base_dir: base_dir.into(),
167 }
168 }
169
170 fn path_for(&self, filename: &str) -> Result<Option<PathBuf>> {
171 resolve_path_within_base(&self.base_dir, filename, "external raw data file")
172 }
173}
174
175impl ExternalFileResolver for FilesystemExternalFileResolver {
176 fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
177 let Some(path) = self.path_for(filename)? else {
178 return Ok(None);
179 };
180 Ok(Some(Arc::new(FileStorage::open(path)?)))
181 }
182}
183
184pub struct FilesystemExternalLinkResolver {
187 base_dir: PathBuf,
188 cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
189}
190
191impl FilesystemExternalLinkResolver {
192 pub fn new(base_dir: impl Into<PathBuf>) -> Self {
193 Self {
194 base_dir: base_dir.into(),
195 cache: parking_lot::Mutex::new(HashMap::new()),
196 }
197 }
198
199 fn path_for(&self, filename: &str) -> Result<Option<PathBuf>> {
200 resolve_path_within_base(&self.base_dir, filename, "external link")
201 }
202}
203
204impl ExternalLinkResolver for FilesystemExternalLinkResolver {
205 fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
206 let Some(path) = self.path_for(filename)? else {
207 return Ok(None);
208 };
209
210 if let Some(file) = self.cache.lock().get(&path).cloned() {
211 return Ok(Some(file));
212 }
213
214 let file = Hdf5File::open(&path)?;
215 self.cache.lock().insert(path, file.clone());
216 Ok(Some(file))
217 }
218}
219
220pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
222
223#[derive(Clone)]
229pub struct Hdf5File {
230 context: Arc<FileContext>,
231}
232
233pub(crate) struct FileContext {
234 pub(crate) storage: DynStorage,
235 pub(crate) superblock: Superblock,
236 pub(crate) chunk_cache: Arc<ChunkCache>,
237 pub(crate) header_cache: HeaderCache,
238 pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
239 pub(crate) filter_registry: Arc<FilterRegistry>,
240 pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
241 pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
242 pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
243 sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
244 full_file_cache: OnceLock<StorageBuffer>,
245}
246
247impl FileContext {
248 pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
249 self.storage.read_range(offset, len)
250 }
251
252 pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
253 if let Some(buffer) = self.full_file_cache.get() {
254 return Ok(buffer.clone());
255 }
256
257 let len = usize::try_from(self.storage.len()).map_err(|_| {
258 Error::InvalidData("file size exceeds platform usize capacity".to_string())
259 })?;
260 let buffer = self.storage.read_range(0, len)?;
261 let _ = self.full_file_cache.set(buffer);
262 Ok(self
263 .full_file_cache
264 .get()
265 .expect("full-file buffer must exist after successful initialization")
266 .clone())
267 }
268
269 pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
270 {
271 let cache = self.header_cache.lock();
272 if let Some(hdr) = cache.get(&addr) {
273 return Ok(Arc::clone(hdr));
274 }
275 }
276
277 let mut hdr = ObjectHeader::parse_at_storage(
278 self.storage.as_ref(),
279 addr,
280 self.superblock.offset_size,
281 self.superblock.length_size,
282 )?;
283 hdr.resolve_shared_messages_storage_with_sohm(
284 self.storage.as_ref(),
285 self.superblock.offset_size,
286 self.superblock.length_size,
287 |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
288 )?;
289 let arc = Arc::new(hdr);
290 let mut cache = self.header_cache.lock();
291 cache.insert(addr, Arc::clone(&arc));
292 Ok(arc)
293 }
294
295 fn resolve_sohm_message(
296 &self,
297 heap_id: &[u8],
298 message_type: u16,
299 ) -> Result<Option<HdfMessage>> {
300 let Some(table) = self.sohm_table()? else {
301 return Ok(None);
302 };
303 table.resolve_heap_message(
304 heap_id,
305 message_type,
306 self.storage.as_ref(),
307 self.superblock.offset_size,
308 self.superblock.length_size,
309 Some(self.filter_registry.as_ref()),
310 )
311 }
312
313 fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
314 let cached = self
315 .sohm_table
316 .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
317 match cached {
318 Ok(table) => Ok(table.clone()),
319 Err(message) => Err(Error::InvalidData(format!(
320 "failed to load SOHM table: {message}"
321 ))),
322 }
323 }
324
325 fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
326 let Some(extension_address) = self.superblock.extension_address else {
327 return Ok(None);
328 };
329 let extension = ObjectHeader::parse_at_storage(
330 self.storage.as_ref(),
331 extension_address,
332 self.superblock.offset_size,
333 self.superblock.length_size,
334 )?;
335
336 let shared_table = extension.messages.iter().find_map(|message| match message {
337 HdfMessage::SharedTable(table) => Some(table),
338 _ => None,
339 });
340 let Some(shared_table) = shared_table else {
341 return Ok(None);
342 };
343
344 let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
345 self.storage.as_ref(),
346 shared_table.table_address,
347 shared_table.num_indices,
348 self.superblock.offset_size,
349 )?;
350 Ok(Some(Arc::new(table)))
351 }
352
353 pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
354 if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
355 return Ok(Some(storage));
356 }
357
358 let Some(resolver) = self.external_file_resolver.as_ref() else {
359 return Ok(None);
360 };
361 let Some(storage) = resolver.resolve_external_file(filename)? else {
362 return Ok(None);
363 };
364 self.external_file_cache
365 .lock()
366 .insert(filename.to_string(), storage.clone());
367 Ok(Some(storage))
368 }
369}
370
371impl Hdf5File {
372 fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
373 let superblock = Superblock::parse_from_storage(storage.as_ref())?;
374 let cache = Arc::new(ChunkCache::new(
375 options.chunk_cache_bytes,
376 options.chunk_cache_slots,
377 ));
378 let registry = options.filter_registry.unwrap_or_default();
379 let external_file_resolver = options.external_file_resolver;
380 let external_link_resolver = options.external_link_resolver;
381
382 Ok(Hdf5File {
383 context: Arc::new(FileContext {
384 storage,
385 superblock,
386 chunk_cache: cache,
387 header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
388 dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
389 filter_registry: Arc::new(registry),
390 external_file_resolver,
391 external_link_resolver,
392 external_file_cache: parking_lot::Mutex::new(HashMap::new()),
393 sohm_table: OnceLock::new(),
394 full_file_cache: OnceLock::new(),
395 }),
396 })
397 }
398
399 pub fn open(path: impl AsRef<Path>) -> Result<Self> {
401 Self::open_with_options(path, OpenOptions::default())
402 }
403
404 pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
406 let path = path.as_ref();
407 Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
408 }
409
410 pub fn from_bytes(data: &[u8]) -> Result<Self> {
414 Self::from_bytes_with_options(data, OpenOptions::default())
415 }
416
417 pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
421 Self::from_vec_with_options(data.to_vec(), options)
422 }
423
424 pub fn from_vec(data: Vec<u8>) -> Result<Self> {
426 Self::from_vec_with_options(data, OpenOptions::default())
427 }
428
429 pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
431 Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
432 }
433
434 pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
438 Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
439 }
440
441 pub fn from_storage(storage: DynStorage) -> Result<Self> {
443 Self::from_storage_with_options(storage, OpenOptions::default())
444 }
445
446 pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
448 Self::from_storage_impl(storage, options)
449 }
450
451 pub fn superblock(&self) -> &Superblock {
453 &self.context.superblock
454 }
455
456 pub fn storage(&self) -> &dyn Storage {
458 self.context.storage.as_ref()
459 }
460
461 pub fn chunk_cache_stats(&self) -> ChunkCacheStats {
463 self.context.chunk_cache.stats()
464 }
465
466 pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
470 self.context.get_or_parse_header(addr)
471 }
472
473 pub fn root_group(&self) -> Result<Group> {
475 let addr = self.context.superblock.root_object_header_address()?;
476
477 Ok(Group::new(
478 self.context.clone(),
479 addr,
480 "/".to_string(),
481 addr, ))
483 }
484
485 pub fn dataset(&self, path: &str) -> Result<Dataset> {
487 let parts: Vec<&str> = path
488 .trim_start_matches('/')
489 .split('/')
490 .filter(|s| !s.is_empty())
491 .collect();
492 let normalized_path = format!("/{}", parts.join("/"));
493
494 if parts.is_empty() {
495 return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
496 }
497
498 if let Some(template) = self
499 .context
500 .dataset_path_cache
501 .lock()
502 .get(&normalized_path)
503 .cloned()
504 {
505 return Ok(Dataset::from_template(self.context.clone(), template));
506 }
507
508 let mut group = self.root_group()?;
509 for &part in &parts[..parts.len() - 1] {
510 group = group.group(part).map_err(|e| e.with_context(path))?;
511 }
512
513 let dataset = group
514 .dataset(parts[parts.len() - 1])
515 .map_err(|e| e.with_context(path))?;
516 if Arc::ptr_eq(&dataset.context, &self.context) {
517 self.context
518 .dataset_path_cache
519 .lock()
520 .insert(normalized_path, dataset.template());
521 }
522 Ok(dataset)
523 }
524
525 pub fn group(&self, path: &str) -> Result<Group> {
527 let parts: Vec<&str> = path
528 .trim_start_matches('/')
529 .split('/')
530 .filter(|s| !s.is_empty())
531 .collect();
532
533 let mut group = self.root_group()?;
534 for &part in &parts {
535 group = group.group(part)?;
536 }
537
538 Ok(group)
539 }
540}
541
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolves `filename` as an external raw data file and returns the
    /// rendered error, panicking with `why` if resolution succeeds.
    fn file_error_text(
        resolver: &FilesystemExternalFileResolver,
        filename: &str,
        why: &str,
    ) -> String {
        let Err(err) = resolver.resolve_external_file(filename) else {
            panic!("{why}");
        };
        err.to_string()
    }

    /// Resolves `filename` as an external link and returns the rendered
    /// error, panicking with `why` if resolution succeeds.
    fn link_error_text(
        resolver: &FilesystemExternalLinkResolver,
        filename: &str,
        why: &str,
    ) -> String {
        let Err(err) = resolver.resolve_external_link(filename) else {
            panic!("{why}");
        };
        err.to_string()
    }

    // Every field of the default options is pinned, not just a subset.
    #[test]
    fn open_options_default() {
        let opts = OpenOptions::default();
        assert_eq!(opts.chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(opts.chunk_cache_slots, 521);
        assert!(opts.filter_registry.is_none());
        assert!(opts.external_file_resolver.is_none());
        assert!(opts.external_link_resolver.is_none());
    }

    #[test]
    fn invalid_file() {
        let data = b"this is not an HDF5 file";
        assert!(Hdf5File::from_bytes(data).is_err());
    }

    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("raw.bin"), b"abcdef").unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let storage = resolver
            .resolve_external_file("raw.bin")
            .unwrap()
            .expect("raw file should resolve");
        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }

    // A file that does not exist is "nothing to resolve", not an error.
    #[test]
    fn filesystem_external_file_resolver_returns_none_for_missing_file() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalFileResolver::new(dir.path());

        let resolved = resolver.resolve_external_file("missing.bin").unwrap();
        assert!(resolved.is_none());
    }

    #[test]
    fn filesystem_external_file_resolver_rejects_empty_path() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalFileResolver::new(dir.path());

        let message = file_error_text(
            &resolver,
            "",
            "empty external file path should be rejected",
        );
        assert!(message.contains("path is empty"));
    }

    #[test]
    fn filesystem_external_file_resolver_rejects_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("raw.bin");
        std::fs::write(&path, b"abcdef").unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let message = file_error_text(
            &resolver,
            path.to_str().unwrap(),
            "absolute external file path should be rejected",
        );
        assert!(message.contains("must be relative"));
    }

    #[test]
    fn filesystem_external_file_resolver_rejects_parent_component() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalFileResolver::new(dir.path());

        let message = file_error_text(
            &resolver,
            "../raw.bin",
            "parent external file path should be rejected",
        );
        assert!(message.contains("resolver base directory"));
    }

    #[cfg(unix)]
    #[test]
    fn filesystem_external_file_resolver_rejects_symlink_escape() {
        use std::os::unix::fs::symlink;

        let dir = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        let outside_path = outside.path().join("raw.bin");
        std::fs::write(&outside_path, b"abcdef").unwrap();
        // The link lives inside the base directory but points outside it.
        symlink(&outside_path, dir.path().join("raw.bin")).unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let message = file_error_text(&resolver, "raw.bin", "symlink escape should be rejected");
        assert!(message.contains("escapes"));
    }

    // A link target that does not exist is "nothing to resolve", not an error.
    #[test]
    fn filesystem_external_link_resolver_returns_none_for_missing_file() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalLinkResolver::new(dir.path());

        let resolved = resolver.resolve_external_link("missing.h5").unwrap();
        assert!(resolved.is_none());
    }

    #[test]
    fn filesystem_external_link_resolver_rejects_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("linked.h5");
        std::fs::write(&path, b"not really hdf5").unwrap();

        let resolver = FilesystemExternalLinkResolver::new(dir.path());
        let message = link_error_text(
            &resolver,
            path.to_str().unwrap(),
            "absolute external link path should be rejected",
        );
        assert!(message.contains("must be relative"));
    }

    #[test]
    fn filesystem_external_link_resolver_rejects_parent_component() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalLinkResolver::new(dir.path());

        let message = link_error_text(
            &resolver,
            "../linked.h5",
            "parent external link path should be rejected",
        );
        assert!(message.contains("resolver base directory"));
    }

    #[cfg(unix)]
    #[test]
    fn filesystem_external_link_resolver_rejects_symlink_escape() {
        use std::os::unix::fs::symlink;

        let dir = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        let outside_path = outside.path().join("linked.h5");
        std::fs::write(&outside_path, b"not really hdf5").unwrap();
        // The link lives inside the base directory but points outside it.
        symlink(&outside_path, dir.path().join("linked.h5")).unwrap();

        let resolver = FilesystemExternalLinkResolver::new(dir.path());
        let message = link_error_text(&resolver, "linked.h5", "symlink escape should be rejected");
        assert!(message.contains("escapes"));
    }
}