Skip to main content

hdf5_reader/
lib.rs

1pub mod checksum;
2pub mod error;
3pub mod io;
4
5// Level 0 — File Metadata
6pub mod superblock;
7
8// Level 1 — File Infrastructure
9pub mod btree_v1;
10pub mod btree_v2;
11pub mod chunk_index;
12pub mod extensible_array;
13pub mod fixed_array;
14pub mod fractal_heap;
15pub mod global_heap;
16pub mod local_heap;
17pub mod shared_message_table;
18pub mod symbol_table;
19
20// Level 2 — Data Objects
21pub mod messages;
22pub mod object_header;
23
24// High-level API
25pub mod attribute_api;
26pub mod dataset;
27pub mod datatype_api;
28pub mod group;
29pub mod reference;
30pub mod storage;
31
32// Filters
33pub mod filters;
34
35// Utilities
36pub mod cache;
37
38use std::collections::HashMap;
39#[cfg(unix)]
40use std::ffi::{CString, OsStr};
41use std::fs::File;
42use std::io::ErrorKind;
43#[cfg(unix)]
44use std::os::fd::{AsRawFd, FromRawFd};
45#[cfg(unix)]
46use std::os::unix::ffi::OsStrExt;
47use std::path::{Component, Path, PathBuf};
48use std::sync::{Arc, OnceLock};
49
50use memmap2::Mmap;
51// parking_lot::Mutex used via fully-qualified paths in HeaderCache and constructors.
52
53use cache::ChunkCache;
54use error::{Error, Result};
55use group::Group;
56use messages::HdfMessage;
57use object_header::ObjectHeader;
58use shared_message_table::SharedMessageTableRef;
59use storage::DynStorage;
60use superblock::Superblock;
61
62// Re-exports
63pub use attribute_api::Attribute;
64pub use cache::ChunkCacheStats;
65use dataset::DatasetTemplate;
66pub use dataset::{Dataset, DatasetChunk, DatasetChunkIterator, SliceInfo, SliceInfoElem};
67pub use datatype_api::{
68    dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
69    StringPadding, StringSize, VarLenKind,
70};
71pub use error::ByteOrder;
72pub use filters::FilterRegistry;
73pub use messages::datatype::Datatype;
74pub use storage::{
75    BlockCacheStats, BlockCacheStorage, BytesStorage, FileStorage, MmapStorage,
76    RangeRequestStorage, Storage, StorageBuffer,
77};
78
79/// Configuration options for opening an HDF5 file.
80pub struct OpenOptions {
81    /// Maximum bytes for the chunk cache. Default: 64 MiB.
82    pub chunk_cache_bytes: usize,
83    /// Maximum number of chunk cache slots. Default: 521.
84    pub chunk_cache_slots: usize,
85    /// Custom filter registry. If `None`, the default built-in filters are used.
86    pub filter_registry: Option<FilterRegistry>,
87    /// Resolver for HDF5 external raw data files. If `None`, external raw data
88    /// files are not resolved.
89    pub external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
90    /// Optional resolver for HDF5 external links.
91    pub external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
92}
93
94impl Default for OpenOptions {
95    fn default() -> Self {
96        OpenOptions {
97            chunk_cache_bytes: 64 * 1024 * 1024,
98            chunk_cache_slots: 521,
99            filter_registry: None,
100            external_file_resolver: None,
101            external_link_resolver: None,
102        }
103    }
104}
105
106/// Resolves file names from HDF5 External Data Files messages to storage.
107///
108/// Implementations are responsible for their own path security policy. The
109/// built-in [`FilesystemExternalFileResolver`] confines normal paths to a base
110/// directory and, on Unix, opens paths through an anchored directory handle
111/// without following symlinks. On non-Unix platforms it falls back to
112/// canonicalize-then-open, so attacker-writable resolver roots are out of
113/// scope there.
114pub trait ExternalFileResolver: Send + Sync {
115    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>>;
116}
117
118/// Resolves HDF5 external links to another opened file.
119///
120/// Implementations are responsible for their own path security policy. The
121/// built-in [`FilesystemExternalLinkResolver`] confines normal paths to a base
122/// directory and, on Unix, opens paths through an anchored directory handle
123/// without following symlinks. On non-Unix platforms it falls back to
124/// canonicalize-then-open, so attacker-writable resolver roots are out of
125/// scope there.
126pub trait ExternalLinkResolver: Send + Sync {
127    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>>;
128}
129
130fn normalize_resolver_path(filename: &str, description: &str) -> Result<PathBuf> {
131    let path = Path::new(filename);
132    if path.as_os_str().is_empty() {
133        return Err(Error::InvalidData(format!("{description} path is empty")));
134    }
135
136    if path.is_absolute() {
137        return Err(Error::InvalidData(format!(
138            "{description} path must be relative: {filename}"
139        )));
140    }
141
142    let mut normalized = PathBuf::new();
143    for component in path.components() {
144        match component {
145            Component::Normal(name) => normalized.push(name),
146            Component::CurDir => {}
147            Component::Prefix(_) | Component::RootDir | Component::ParentDir => {
148                return Err(Error::InvalidData(format!(
149                    "{description} path must stay within the resolver base directory: {filename}"
150                )));
151            }
152        }
153    }
154
155    if normalized.as_os_str().is_empty() {
156        return Err(Error::InvalidData(format!("{description} path is empty")));
157    }
158
159    Ok(normalized)
160}
161
/// Opens `relative_path` beneath `base_dir` on non-Unix platforms.
///
/// Both the base directory and the joined candidate are canonicalized, and
/// the candidate must still start with the canonical base before it is
/// opened. Returns `Ok(None)` when either path does not exist.
///
/// # Errors
///
/// Returns [`Error::InvalidData`] when the canonical candidate lies outside
/// the canonical base, and propagates any other I/O failure.
#[cfg(not(unix))]
fn open_external_file_within_base(
    base_dir: &Path,
    relative_path: &Path,
    description: &str,
    filename: &str,
) -> Result<Option<File>> {
    // Canonicalize a path, mapping "does not exist" to `None`.
    let canonicalize_existing = |path: &Path| -> Result<Option<PathBuf>> {
        match path.canonicalize() {
            Ok(canonical) => Ok(Some(canonical)),
            Err(err) if err.kind() == ErrorKind::NotFound => Ok(None),
            Err(err) => Err(err.into()),
        }
    };

    let Some(root) = canonicalize_existing(base_dir)? else {
        return Ok(None);
    };
    let Some(target) = canonicalize_existing(&root.join(relative_path))? else {
        return Ok(None);
    };

    if target.starts_with(&root) {
        Ok(Some(File::open(target)?))
    } else {
        Err(Error::InvalidData(format!(
            "{description} path escapes the resolver base directory: {filename}"
        )))
    }
}
189
190#[cfg(unix)]
191fn open_external_file_within_base(
192    base_dir: &Path,
193    relative_path: &Path,
194    description: &str,
195    filename: &str,
196) -> Result<Option<File>> {
197    let mut dir = match open_unix_path(
198        base_dir,
199        libc::O_RDONLY | libc::O_CLOEXEC | libc::O_NOFOLLOW,
200    ) {
201        Ok(file) => file,
202        Err(err) if path_lookup_is_missing(&err) => return Ok(None),
203        Err(err) if path_lookup_is_symlink(&err) => {
204            return Err(Error::InvalidData(format!(
205                "{description} resolver base directory must not be a symlink"
206            )));
207        }
208        Err(err) => return Err(err.into()),
209    };
210    if !dir.metadata()?.is_dir() {
211        return Ok(None);
212    }
213
214    let parts: Vec<&OsStr> = relative_path
215        .components()
216        .filter_map(|component| match component {
217            Component::Normal(name) => Some(name),
218            _ => None,
219        })
220        .collect();
221
222    let Some((leaf, parents)) = parts.split_last() else {
223        return Err(Error::InvalidData(format!("{description} path is empty")));
224    };
225
226    for parent in parents {
227        dir = match open_unix_child(
228            &dir,
229            parent,
230            libc::O_RDONLY | libc::O_CLOEXEC | libc::O_NOFOLLOW,
231        ) {
232            Ok(file) => file,
233            Err(err) if path_lookup_is_missing(&err) => return Ok(None),
234            Err(err) if path_lookup_is_symlink(&err) => {
235                return Err(symlink_resolver_error(description, filename));
236            }
237            Err(err) => return Err(err.into()),
238        };
239        if !dir.metadata()?.is_dir() {
240            return Ok(None);
241        }
242    }
243
244    let file = match open_unix_child(
245        &dir,
246        leaf,
247        libc::O_RDONLY | libc::O_CLOEXEC | libc::O_NOFOLLOW,
248    ) {
249        Ok(file) => file,
250        Err(err) if path_lookup_is_missing(&err) => return Ok(None),
251        Err(err) if path_lookup_is_symlink(&err) => {
252            return Err(symlink_resolver_error(description, filename));
253        }
254        Err(err) => return Err(err.into()),
255    };
256
257    if file.metadata()?.is_dir() {
258        return Err(Error::InvalidData(format!(
259            "{description} path resolves to a directory: {filename}"
260        )));
261    }
262
263    Ok(Some(file))
264}
265
266#[cfg(unix)]
267fn open_unix_path(path: &Path, flags: libc::c_int) -> std::io::Result<File> {
268    let path = CString::new(path.as_os_str().as_bytes()).map_err(|_| {
269        std::io::Error::new(
270            std::io::ErrorKind::InvalidInput,
271            "filesystem path contains an interior NUL byte",
272        )
273    })?;
274    let fd = unsafe { libc::open(path.as_ptr(), flags) };
275    file_from_unix_fd(fd)
276}
277
278#[cfg(unix)]
279fn open_unix_child(dir: &File, name: &OsStr, flags: libc::c_int) -> std::io::Result<File> {
280    let name = CString::new(name.as_bytes()).map_err(|_| {
281        std::io::Error::new(
282            std::io::ErrorKind::InvalidInput,
283            "filesystem path contains an interior NUL byte",
284        )
285    })?;
286    let fd = unsafe { libc::openat(dir.as_raw_fd(), name.as_ptr(), flags) };
287    file_from_unix_fd(fd)
288}
289
290#[cfg(unix)]
291fn file_from_unix_fd(fd: libc::c_int) -> std::io::Result<File> {
292    if fd < 0 {
293        Err(std::io::Error::last_os_error())
294    } else {
295        Ok(unsafe { File::from_raw_fd(fd) })
296    }
297}
298
299#[cfg(unix)]
300fn path_lookup_is_missing(err: &std::io::Error) -> bool {
301    err.kind() == ErrorKind::NotFound
302        || matches!(err.raw_os_error(), Some(code) if code == libc::ENOTDIR)
303}
304
305#[cfg(unix)]
306fn path_lookup_is_symlink(err: &std::io::Error) -> bool {
307    matches!(err.raw_os_error(), Some(code) if code == libc::ELOOP)
308}
309
310#[cfg(unix)]
311fn symlink_resolver_error(description: &str, filename: &str) -> Error {
312    Error::InvalidData(format!(
313        "{description} path escapes the resolver base directory or uses a symlink: {filename}"
314    ))
315}
316
/// Resolves external raw data files from a directory on the local
/// filesystem.
///
/// Absolute paths and `..` components are rejected. On Unix, paths are opened
/// relative to `base_dir` with `openat` and `O_NOFOLLOW`, so a symlink causes
/// an error rather than being followed. On non-Unix platforms,
/// attacker-writable resolver roots are out of scope.
#[derive(Debug, Clone)]
pub struct FilesystemExternalFileResolver {
    /// Directory that every resolved path must live under.
    base_dir: PathBuf,
}
327
328impl FilesystemExternalFileResolver {
329    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
330        Self {
331            base_dir: base_dir.into(),
332        }
333    }
334
335    fn relative_path_for(&self, filename: &str) -> Result<PathBuf> {
336        normalize_resolver_path(filename, "external raw data file")
337    }
338}
339
340impl ExternalFileResolver for FilesystemExternalFileResolver {
341    fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
342        let relative_path = self.relative_path_for(filename)?;
343        let Some(file) = open_external_file_within_base(
344            &self.base_dir,
345            &relative_path,
346            "external raw data file",
347            filename,
348        )?
349        else {
350            return Ok(None);
351        };
352        Ok(Some(Arc::new(FileStorage::from_file(file)?)))
353    }
354}
355
356/// Filesystem resolver for external links. Linked files are cached after the
357/// first successful open.
358///
359/// The resolver rejects absolute paths and `..` components. On Unix, it opens
360/// paths relative to `base_dir` using `openat` and `O_NOFOLLOW`, so symlinks
361/// are rejected instead of being followed. On non-Unix platforms,
362/// attacker-writable resolver roots are out of scope.
363pub struct FilesystemExternalLinkResolver {
364    base_dir: PathBuf,
365    cache: parking_lot::Mutex<HashMap<PathBuf, Hdf5File>>,
366}
367
368impl FilesystemExternalLinkResolver {
369    pub fn new(base_dir: impl Into<PathBuf>) -> Self {
370        Self {
371            base_dir: base_dir.into(),
372            cache: parking_lot::Mutex::new(HashMap::new()),
373        }
374    }
375
376    fn relative_path_for(&self, filename: &str) -> Result<PathBuf> {
377        normalize_resolver_path(filename, "external link")
378    }
379}
380
381impl ExternalLinkResolver for FilesystemExternalLinkResolver {
382    fn resolve_external_link(&self, filename: &str) -> Result<Option<Hdf5File>> {
383        let relative_path = self.relative_path_for(filename)?;
384
385        if let Some(file) = self.cache.lock().get(&relative_path).cloned() {
386            return Ok(Some(file));
387        }
388
389        let Some(opened) = open_external_file_within_base(
390            &self.base_dir,
391            &relative_path,
392            "external link",
393            filename,
394        )?
395        else {
396            return Ok(None);
397        };
398
399        let file = Hdf5File::from_storage(Arc::new(FileStorage::from_file(opened)?))?;
400        self.cache.lock().insert(relative_path, file.clone());
401        Ok(Some(file))
402    }
403}
404
405/// Cache for parsed object headers, keyed by file address.
406pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
407
408/// An opened HDF5 file.
409///
410/// This is the main entry point for reading HDF5 files. Storage is random-
411/// access and range-based, so metadata and data reads do not require an eager
412/// whole-file mapping.
413#[derive(Clone)]
414pub struct Hdf5File {
415    context: Arc<FileContext>,
416}
417
418pub(crate) struct FileContext {
419    pub(crate) storage: DynStorage,
420    pub(crate) superblock: Superblock,
421    pub(crate) chunk_cache: Arc<ChunkCache>,
422    pub(crate) header_cache: HeaderCache,
423    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
424    pub(crate) filter_registry: Arc<FilterRegistry>,
425    pub(crate) external_file_resolver: Option<Arc<dyn ExternalFileResolver>>,
426    pub(crate) external_link_resolver: Option<Arc<dyn ExternalLinkResolver>>,
427    pub(crate) external_file_cache: parking_lot::Mutex<HashMap<String, DynStorage>>,
428    sohm_table: OnceLock<std::result::Result<Option<SharedMessageTableRef>, String>>,
429    full_file_cache: OnceLock<StorageBuffer>,
430}
431
432impl FileContext {
433    pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
434        self.storage.read_range(offset, len)
435    }
436
437    pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
438        if let Some(buffer) = self.full_file_cache.get() {
439            return Ok(buffer.clone());
440        }
441
442        let len = usize::try_from(self.storage.len()).map_err(|_| {
443            Error::InvalidData("file size exceeds platform usize capacity".to_string())
444        })?;
445        let buffer = self.storage.read_range(0, len)?;
446        let _ = self.full_file_cache.set(buffer);
447        Ok(self
448            .full_file_cache
449            .get()
450            .expect("full-file buffer must exist after successful initialization")
451            .clone())
452    }
453
454    pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
455        {
456            let cache = self.header_cache.lock();
457            if let Some(hdr) = cache.get(&addr) {
458                return Ok(Arc::clone(hdr));
459            }
460        }
461
462        let mut hdr = ObjectHeader::parse_at_storage(
463            self.storage.as_ref(),
464            addr,
465            self.superblock.offset_size,
466            self.superblock.length_size,
467        )?;
468        hdr.resolve_shared_messages_storage_with_sohm(
469            self.storage.as_ref(),
470            self.superblock.offset_size,
471            self.superblock.length_size,
472            |heap_id, message_type| self.resolve_sohm_message(heap_id, message_type),
473        )?;
474        let arc = Arc::new(hdr);
475        let mut cache = self.header_cache.lock();
476        cache.insert(addr, Arc::clone(&arc));
477        Ok(arc)
478    }
479
480    fn resolve_sohm_message(
481        &self,
482        heap_id: &[u8],
483        message_type: u16,
484    ) -> Result<Option<HdfMessage>> {
485        let Some(table) = self.sohm_table()? else {
486            return Ok(None);
487        };
488        table.resolve_heap_message(
489            heap_id,
490            message_type,
491            self.storage.as_ref(),
492            self.superblock.offset_size,
493            self.superblock.length_size,
494            Some(self.filter_registry.as_ref()),
495        )
496    }
497
498    fn sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
499        let cached = self
500            .sohm_table
501            .get_or_init(|| self.load_sohm_table().map_err(|err| err.to_string()));
502        match cached {
503            Ok(table) => Ok(table.clone()),
504            Err(message) => Err(Error::InvalidData(format!(
505                "failed to load SOHM table: {message}"
506            ))),
507        }
508    }
509
510    fn load_sohm_table(&self) -> Result<Option<SharedMessageTableRef>> {
511        let Some(extension_address) = self.superblock.extension_address else {
512            return Ok(None);
513        };
514        let extension = ObjectHeader::parse_at_storage(
515            self.storage.as_ref(),
516            extension_address,
517            self.superblock.offset_size,
518            self.superblock.length_size,
519        )?;
520
521        let shared_table = extension.messages.iter().find_map(|message| match message {
522            HdfMessage::SharedTable(table) => Some(table),
523            _ => None,
524        });
525        let Some(shared_table) = shared_table else {
526            return Ok(None);
527        };
528
529        let table = crate::shared_message_table::SharedMessageTable::parse_at_storage(
530            self.storage.as_ref(),
531            shared_table.table_address,
532            shared_table.num_indices,
533            self.superblock.offset_size,
534        )?;
535        Ok(Some(Arc::new(table)))
536    }
537
538    pub(crate) fn resolve_external_file(&self, filename: &str) -> Result<Option<DynStorage>> {
539        if let Some(storage) = self.external_file_cache.lock().get(filename).cloned() {
540            return Ok(Some(storage));
541        }
542
543        let Some(resolver) = self.external_file_resolver.as_ref() else {
544            return Ok(None);
545        };
546        let Some(storage) = resolver.resolve_external_file(filename)? else {
547            return Ok(None);
548        };
549        self.external_file_cache
550            .lock()
551            .insert(filename.to_string(), storage.clone());
552        Ok(Some(storage))
553    }
554}
555
556impl Hdf5File {
557    fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
558        let superblock = Superblock::parse_from_storage(storage.as_ref())?;
559        let cache = Arc::new(ChunkCache::new(
560            options.chunk_cache_bytes,
561            options.chunk_cache_slots,
562        ));
563        let registry = options.filter_registry.unwrap_or_default();
564        let external_file_resolver = options.external_file_resolver;
565        let external_link_resolver = options.external_link_resolver;
566
567        Ok(Hdf5File {
568            context: Arc::new(FileContext {
569                storage,
570                superblock,
571                chunk_cache: cache,
572                header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
573                dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
574                filter_registry: Arc::new(registry),
575                external_file_resolver,
576                external_link_resolver,
577                external_file_cache: parking_lot::Mutex::new(HashMap::new()),
578                sohm_table: OnceLock::new(),
579                full_file_cache: OnceLock::new(),
580            }),
581        })
582    }
583
584    /// Open an HDF5 file with default options.
585    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
586        Self::open_with_options(path, OpenOptions::default())
587    }
588
589    /// Open an HDF5 file with custom options.
590    pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
591        let path = path.as_ref();
592        Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
593    }
594
595    /// Open an HDF5 file from an in-memory byte slice.
596    ///
597    /// The data is copied into an owned buffer.
598    pub fn from_bytes(data: &[u8]) -> Result<Self> {
599        Self::from_bytes_with_options(data, OpenOptions::default())
600    }
601
602    /// Open an HDF5 file from an in-memory byte slice with custom options.
603    ///
604    /// The data is copied into an owned buffer.
605    pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
606        Self::from_vec_with_options(data.to_vec(), options)
607    }
608
609    /// Open an HDF5 file from an owned byte vector without copying.
610    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
611        Self::from_vec_with_options(data, OpenOptions::default())
612    }
613
614    /// Open an HDF5 file from an owned byte vector with custom options.
615    pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
616        Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
617    }
618
619    /// Open an HDF5 file from an existing memory map with custom options.
620    ///
621    /// This avoids remapping when the caller already owns a read-only mapping.
622    pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
623        Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
624    }
625
626    /// Open an HDF5 file from a custom random-access storage backend.
627    pub fn from_storage(storage: DynStorage) -> Result<Self> {
628        Self::from_storage_with_options(storage, OpenOptions::default())
629    }
630
631    /// Open an HDF5 file from a custom random-access storage backend.
632    pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
633        Self::from_storage_impl(storage, options)
634    }
635
636    /// Get the parsed superblock.
637    pub fn superblock(&self) -> &Superblock {
638        &self.context.superblock
639    }
640
641    /// Access the underlying random-access storage backend.
642    pub fn storage(&self) -> &dyn Storage {
643        self.context.storage.as_ref()
644    }
645
646    /// Return current chunk-cache statistics for this file.
647    pub fn chunk_cache_stats(&self) -> ChunkCacheStats {
648        self.context.chunk_cache.stats()
649    }
650
651    /// Look up or parse an object header at the given address.
652    ///
653    /// Uses the internal cache to avoid re-parsing the same header.
654    pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
655        self.context.get_or_parse_header(addr)
656    }
657
658    /// Get the root group of the file.
659    pub fn root_group(&self) -> Result<Group> {
660        let addr = self.context.superblock.root_object_header_address()?;
661
662        Ok(Group::new(
663            self.context.clone(),
664            addr,
665            "/".to_string(),
666            addr, // root_address = self
667        ))
668    }
669
670    /// Convenience: get a dataset at a path like "/group1/dataset".
671    pub fn dataset(&self, path: &str) -> Result<Dataset> {
672        let parts: Vec<&str> = path
673            .trim_start_matches('/')
674            .split('/')
675            .filter(|s| !s.is_empty())
676            .collect();
677        let normalized_path = format!("/{}", parts.join("/"));
678
679        if parts.is_empty() {
680            return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
681        }
682
683        if let Some(template) = self
684            .context
685            .dataset_path_cache
686            .lock()
687            .get(&normalized_path)
688            .cloned()
689        {
690            return Ok(Dataset::from_template(self.context.clone(), template));
691        }
692
693        let mut group = self.root_group()?;
694        for &part in &parts[..parts.len() - 1] {
695            group = group.group(part).map_err(|e| e.with_context(path))?;
696        }
697
698        let dataset = group
699            .dataset(parts[parts.len() - 1])
700            .map_err(|e| e.with_context(path))?;
701        if Arc::ptr_eq(&dataset.context, &self.context) {
702            self.context
703                .dataset_path_cache
704                .lock()
705                .insert(normalized_path, dataset.template());
706        }
707        Ok(dataset)
708    }
709
710    /// Convenience: get a group at a path like "/group1/subgroup".
711    pub fn group(&self, path: &str) -> Result<Group> {
712        let parts: Vec<&str> = path
713            .trim_start_matches('/')
714            .split('/')
715            .filter(|s| !s.is_empty())
716            .collect();
717
718        let mut group = self.root_group()?;
719        for &part in &parts {
720            group = group.group(part)?;
721        }
722
723        Ok(group)
724    }
725}
726
#[cfg(test)]
mod tests {
    use super::*;

    /// The default options carry the documented cache sizes and no external
    /// file resolver.
    #[test]
    fn open_options_default() {
        let defaults = OpenOptions::default();
        assert_eq!(defaults.chunk_cache_bytes, 64 * 1024 * 1024);
        assert_eq!(defaults.chunk_cache_slots, 521);
        assert!(defaults.external_file_resolver.is_none());
    }

    /// Bytes that are not an HDF5 file fail to open.
    #[test]
    fn invalid_file() {
        let outcome = Hdf5File::from_bytes(b"this is not an HDF5 file");
        assert!(outcome.is_err());
    }

    /// The B-tree v2 fixture reports a B-tree v2 chunk index; the test is
    /// skipped when the fixture is absent.
    #[test]
    fn btree_v2_chunked_fixture_uses_btree_v2_index() {
        use crate::messages::layout::{ChunkIndexing, DataLayout};

        let fixture = Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .join("testdata/hdf5/btree_v2_chunked.h5");
        if !fixture.exists() {
            eprintln!("SKIPPED: fixture btree_v2_chunked.h5 not found");
            return;
        }

        let file = Hdf5File::open(fixture).unwrap();
        let dataset = file.dataset("/data").unwrap();
        let uses_btree_v2 = matches!(
            dataset.layout,
            DataLayout::Chunked {
                chunk_indexing: Some(ChunkIndexing::BTreeV2),
                ..
            }
        );
        assert!(uses_btree_v2);
    }

    /// A plain relative file under the base directory resolves and reads.
    #[test]
    fn filesystem_external_file_resolver_reads_relative_file() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("raw.bin"), b"abcdef").unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let resolved = resolver.resolve_external_file("raw.bin").unwrap();
        let storage = resolved.expect("raw file should resolve");
        let bytes = storage.read_range(2, 3).unwrap();
        assert_eq!(bytes.as_ref(), b"cde");
    }

    /// Absolute paths are refused even when they point inside the base.
    #[test]
    fn filesystem_external_file_resolver_rejects_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let absolute = dir.path().join("raw.bin");
        std::fs::write(&absolute, b"abcdef").unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let Err(err) = resolver.resolve_external_file(absolute.to_str().unwrap()) else {
            panic!("absolute external file path should be rejected");
        };
        assert!(err.to_string().contains("must be relative"));
    }

    /// `..` components are refused.
    #[test]
    fn filesystem_external_file_resolver_rejects_parent_component() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalFileResolver::new(dir.path());

        let Err(err) = resolver.resolve_external_file("../raw.bin") else {
            panic!("parent external file path should be rejected");
        };
        assert!(err.to_string().contains("resolver base directory"));
    }

    /// A symlink pointing outside the base directory is refused.
    #[cfg(unix)]
    #[test]
    fn filesystem_external_file_resolver_rejects_symlink_escape() {
        use std::os::unix::fs::symlink;

        let dir = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        let target = outside.path().join("raw.bin");
        std::fs::write(&target, b"abcdef").unwrap();
        symlink(&target, dir.path().join("raw.bin")).unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let Err(err) = resolver.resolve_external_file("raw.bin") else {
            panic!("symlink escape should be rejected");
        };
        assert!(err.to_string().contains("escapes"));
    }

    /// A symlink is refused even when its target lives inside the base.
    #[cfg(unix)]
    #[test]
    fn filesystem_external_file_resolver_rejects_symlink_inside_base() {
        use std::os::unix::fs::symlink;

        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("raw.bin"), b"abcdef").unwrap();
        symlink("raw.bin", dir.path().join("link.bin")).unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let Err(err) = resolver.resolve_external_file("link.bin") else {
            panic!("symlinks should be rejected even when they point inside the base");
        };
        assert!(err.to_string().contains("symlink"));
    }

    /// A symlinked intermediate directory is refused.
    #[cfg(unix)]
    #[test]
    fn filesystem_external_file_resolver_rejects_symlink_directory_component() {
        use std::os::unix::fs::symlink;

        let dir = tempfile::tempdir().unwrap();
        let real_dir = dir.path().join("real");
        std::fs::create_dir(&real_dir).unwrap();
        std::fs::write(real_dir.join("raw.bin"), b"abcdef").unwrap();
        symlink("real", dir.path().join("linkdir")).unwrap();

        let resolver = FilesystemExternalFileResolver::new(dir.path());
        let Err(err) = resolver.resolve_external_file("linkdir/raw.bin") else {
            panic!("symlinked directory components should be rejected");
        };
        assert!(err.to_string().contains("symlink"));
    }

    /// The link resolver refuses absolute paths.
    #[test]
    fn filesystem_external_link_resolver_rejects_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let absolute = dir.path().join("linked.h5");
        std::fs::write(&absolute, b"not really hdf5").unwrap();

        let resolver = FilesystemExternalLinkResolver::new(dir.path());
        let Err(err) = resolver.resolve_external_link(absolute.to_str().unwrap()) else {
            panic!("absolute external link path should be rejected");
        };
        assert!(err.to_string().contains("must be relative"));
    }

    /// The link resolver refuses `..` components.
    #[test]
    fn filesystem_external_link_resolver_rejects_parent_component() {
        let dir = tempfile::tempdir().unwrap();
        let resolver = FilesystemExternalLinkResolver::new(dir.path());

        let Err(err) = resolver.resolve_external_link("../linked.h5") else {
            panic!("parent external link path should be rejected");
        };
        assert!(err.to_string().contains("resolver base directory"));
    }

    /// The link resolver refuses a symlink pointing outside the base.
    #[cfg(unix)]
    #[test]
    fn filesystem_external_link_resolver_rejects_symlink_escape() {
        use std::os::unix::fs::symlink;

        let dir = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        let target = outside.path().join("linked.h5");
        std::fs::write(&target, b"not really hdf5").unwrap();
        symlink(&target, dir.path().join("linked.h5")).unwrap();

        let resolver = FilesystemExternalLinkResolver::new(dir.path());
        let Err(err) = resolver.resolve_external_link("linked.h5") else {
            panic!("symlink escape should be rejected");
        };
        assert!(err.to_string().contains("escapes"));
    }
}