Skip to main content

hdf5_reader/
group.rs

1use std::sync::Arc;
2
3use crate::attribute_api::{
4    collect_attribute_messages_storage, resolve_vlen_bytes_storage, Attribute,
5};
6use crate::btree_v1;
7use crate::btree_v2;
8use crate::checksum::jenkins_lookup3;
9use crate::dataset::Dataset;
10use crate::error::{Error, Result};
11use crate::fractal_heap::{FractalHeap, FractalHeapDirectBlockCache};
12use crate::io::Cursor;
13use crate::local_heap::LocalHeap;
14use crate::messages::datatype::VarLenKind;
15use crate::messages::link::{self, LinkMessage, LinkTarget};
16use crate::messages::link_info::LinkInfoMessage;
17use crate::messages::symbol_table_msg::SymbolTableMessage;
18use crate::messages::HdfMessage;
19use crate::storage::Storage;
20use crate::FileContext;
21
22/// A group within an HDF5 file.
23#[derive(Clone)]
24pub struct Group {
25    context: Arc<FileContext>,
26    pub(crate) name: String,
27    pub(crate) address: u64,
28    /// Address of the root group's object header, used for resolving soft links.
29    pub(crate) root_address: u64,
30}
31
32#[derive(Clone)]
33struct ChildEntry {
34    name: String,
35    location: ObjectLocation,
36}
37
38#[derive(Clone)]
39struct ObjectLocation {
40    context: Arc<FileContext>,
41    address: u64,
42    root_address: u64,
43}
44
/// Coarse classification of a child object, derived from its header messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChildObjectKind {
    /// The header carries at least one group-related message.
    Group,
    /// The header carries dataset-related messages and no group message.
    Dataset,
    /// Neither group nor dataset messages were found.
    Other,
}
51
52impl Group {
53    /// Create a group from a known object header address.
54    pub(crate) fn new(
55        context: Arc<FileContext>,
56        address: u64,
57        name: String,
58        root_address: u64,
59    ) -> Self {
60        Group {
61            context,
62            name,
63            address,
64            root_address,
65        }
66    }
67
68    /// Group name.
69    pub fn name(&self) -> &str {
70        &self.name
71    }
72
73    /// Object header address of this group within the file.
74    pub fn address(&self) -> u64 {
75        self.address
76    }
77
78    /// Materialize the full file backing this group.
79    pub fn file_data(&self) -> Result<crate::storage::StorageBuffer> {
80        self.context.full_file_data()
81    }
82
83    /// Access the underlying random-access storage backend.
84    pub fn storage(&self) -> &dyn Storage {
85        self.context.storage.as_ref()
86    }
87
88    /// Size of file offsets in bytes.
89    pub fn offset_size(&self) -> u8 {
90        self.context.superblock.offset_size
91    }
92
93    /// Size of file lengths in bytes.
94    pub fn length_size(&self) -> u8 {
95        self.context.superblock.length_size
96    }
97
98    /// Parse (or retrieve from cache) the object header at the given address.
99    fn cached_header(&self, addr: u64) -> Result<Arc<crate::object_header::ObjectHeader>> {
100        self.context.get_or_parse_header(addr)
101    }
102
103    fn local_location(&self, address: u64) -> ObjectLocation {
104        ObjectLocation {
105            context: self.context.clone(),
106            address,
107            root_address: self.root_address,
108        }
109    }
110
111    /// List all child groups.
112    pub fn groups(&self) -> Result<Vec<Group>> {
113        let (groups, _) = self.resolve_member_objects()?;
114        Ok(groups)
115    }
116
117    /// List all child members, partitioned into groups and datasets.
118    pub fn members(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
119        self.resolve_member_objects()
120    }
121
122    fn resolve_member_objects(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
123        let children = self.resolve_children()?;
124        let mut groups = Vec::new();
125        let mut datasets = Vec::new();
126        for child in &children {
127            match self.child_object_kind(child)? {
128                ChildObjectKind::Group => {
129                    groups.push(Group::new(
130                        child.location.context.clone(),
131                        child.location.address,
132                        child.name.clone(),
133                        child.location.root_address,
134                    ));
135                }
136                ChildObjectKind::Dataset => {
137                    if let Some(dataset) = self.try_open_child_dataset(child)? {
138                        datasets.push(dataset);
139                    }
140                }
141                ChildObjectKind::Other => {}
142            }
143        }
144        Ok((groups, datasets))
145    }
146
147    /// Get a child group by name.
148    pub fn group(&self, name: &str) -> Result<Group> {
149        if let Some(child) = self.resolve_child(name)? {
150            return match self.child_object_kind(&child)? {
151                ChildObjectKind::Group => Ok(Group::new(
152                    child.location.context.clone(),
153                    child.location.address,
154                    child.name.clone(),
155                    child.location.root_address,
156                )),
157                ChildObjectKind::Dataset => Err(Error::GroupNotFound(format!(
158                    "'{}' is a dataset, not a group",
159                    name
160                ))),
161                ChildObjectKind::Other => {
162                    Err(Error::GroupNotFound(format!("'{}' is not a group", name)))
163                }
164            };
165        }
166        Err(Error::GroupNotFound(name.to_string()))
167    }
168
169    /// List all child datasets.
170    pub fn datasets(&self) -> Result<Vec<Dataset>> {
171        let (_, datasets) = self.resolve_member_objects()?;
172        Ok(datasets)
173    }
174
175    /// Get a child dataset by name.
176    pub fn dataset(&self, name: &str) -> Result<Dataset> {
177        if let Some(child) = self.resolve_child(name)? {
178            if let Some(dataset) = self.try_open_child_dataset(&child)? {
179                return Ok(dataset);
180            }
181            return Err(Error::DatasetNotFound(name.to_string()));
182        }
183        Err(Error::DatasetNotFound(name.to_string()))
184    }
185
186    /// List attributes on this group.
187    pub fn attributes(&self) -> Result<Vec<Attribute>> {
188        let mut header = (*self.cached_header(self.address)?).clone();
189        header.resolve_shared_messages_storage(
190            self.context.storage.as_ref(),
191            self.offset_size(),
192            self.length_size(),
193        )?;
194        Ok(collect_attribute_messages_storage(
195            &header,
196            self.context.storage.as_ref(),
197            self.offset_size(),
198            self.length_size(),
199            Some(self.context.filter_registry.as_ref()),
200        )?
201        .into_iter()
202        .map(|attr| {
203            let raw_data = match &attr.datatype {
204                crate::messages::datatype::Datatype::VarLen {
205                    base,
206                    kind: VarLenKind::String,
207                    ..
208                } if matches!(
209                    base.as_ref(),
210                    crate::messages::datatype::Datatype::FixedPoint { size: 1, .. }
211                ) && matches!(attr.dataspace.num_elements(), Ok(1)) =>
212                {
213                    resolve_vlen_bytes_storage(
214                        &attr.raw_data,
215                        self.context.storage.as_ref(),
216                        self.offset_size(),
217                        self.length_size(),
218                    )
219                    .unwrap_or_else(|| attr.raw_data.clone())
220                }
221                _ => attr.raw_data.clone(),
222            };
223            Attribute {
224                name: attr.name,
225                datatype: attr.datatype,
226                shape: match attr.dataspace.dataspace_type {
227                    crate::messages::dataspace::DataspaceType::Scalar => vec![],
228                    crate::messages::dataspace::DataspaceType::Null => vec![0],
229                    crate::messages::dataspace::DataspaceType::Simple => attr.dataspace.dims,
230                },
231                raw_data,
232            }
233        })
234        .collect())
235    }
236
237    /// Find an attribute by name.
238    pub fn attribute(&self, name: &str) -> Result<Attribute> {
239        let attrs = self.attributes()?;
240        attrs
241            .into_iter()
242            .find(|a| a.name == name)
243            .ok_or_else(|| Error::AttributeNotFound(name.to_string()))
244    }
245
246    /// Resolve children from the object header.
247    /// Handles both old-style (symbol table) and new-style (link messages) groups.
248    fn resolve_children(&self) -> Result<Vec<ChildEntry>> {
249        self.resolve_children_with_link_depth(0)
250    }
251
252    fn resolve_child(&self, name: &str) -> Result<Option<ChildEntry>> {
253        self.resolve_child_with_link_depth(name, 0)
254    }
255
256    fn resolve_child_with_link_depth(
257        &self,
258        name: &str,
259        link_depth: u32,
260    ) -> Result<Option<ChildEntry>> {
261        let header = self.cached_header(self.address)?;
262
263        let mut link_info: Option<LinkInfoMessage> = None;
264        let mut matching_compact_link: Option<LinkMessage> = None;
265
266        for msg in &header.messages {
267            match msg {
268                HdfMessage::SymbolTable(st) => {
269                    return Ok(self
270                        .resolve_old_style_group_storage(st)?
271                        .into_iter()
272                        .find(|child| child.name == name));
273                }
274                HdfMessage::Link(link) if link.name == name => {
275                    matching_compact_link = Some(link.clone());
276                }
277                HdfMessage::LinkInfo(li) => {
278                    link_info = Some(li.clone());
279                }
280                _ => {}
281            }
282        }
283
284        if let Some(link) = matching_compact_link {
285            if let Some(child) = self.resolve_link_message_target(&link, link_depth)? {
286                return Ok(Some(child));
287            }
288        }
289
290        if let Some(ref li) = link_info {
291            if !Cursor::is_undefined_offset(li.fractal_heap_address, self.offset_size()) {
292                return self.resolve_dense_link_storage(li, name, link_depth);
293            }
294        }
295
296        Ok(None)
297    }
298
299    /// Resolve children with a soft-link depth counter to prevent cycles.
300    fn resolve_children_with_link_depth(&self, link_depth: u32) -> Result<Vec<ChildEntry>> {
301        let header = self.cached_header(self.address)?;
302
303        let mut children = Vec::new();
304
305        // Check for old-style groups (symbol table message)
306        let mut found_symbol_table = false;
307        // Check for new-style groups (link messages)
308        let mut link_info: Option<LinkInfoMessage> = None;
309        let mut links: Vec<LinkMessage> = Vec::new();
310
311        for msg in &header.messages {
312            match msg {
313                HdfMessage::SymbolTable(st) => {
314                    found_symbol_table = true;
315                    children = self.resolve_old_style_group_storage(st)?;
316                }
317                HdfMessage::Link(link) => {
318                    links.push(link.clone());
319                }
320                HdfMessage::LinkInfo(li) => {
321                    link_info = Some(li.clone());
322                }
323                _ => {}
324            }
325        }
326
327        if !found_symbol_table {
328            // New-style group: use compact links from header messages
329            self.resolve_link_targets(&links, link_depth, &mut children)?;
330
331            // Dense-link storage can coexist with compact links, so merge both.
332            if let Some(ref li) = link_info {
333                if !Cursor::is_undefined_offset(li.fractal_heap_address, self.offset_size()) {
334                    for child in self.resolve_dense_links_storage(li, link_depth)? {
335                        let is_duplicate = children.iter().any(|existing| {
336                            existing.name == child.name
337                                && existing.location.address == child.location.address
338                                && Arc::ptr_eq(&existing.location.context, &child.location.context)
339                        });
340                        if !is_duplicate {
341                            children.push(child);
342                        }
343                    }
344                }
345            }
346        }
347
348        Ok(children)
349    }
350
351    /// Resolve link targets (hard and soft), appending to `children`.
352    fn resolve_link_targets(
353        &self,
354        links: &[LinkMessage],
355        link_depth: u32,
356        children: &mut Vec<ChildEntry>,
357    ) -> Result<()> {
358        for link in links {
359            if let Some(child) = self.resolve_link_message_target(link, link_depth)? {
360                children.push(child);
361            }
362        }
363        Ok(())
364    }
365
366    fn resolve_link_message_target(
367        &self,
368        link: &LinkMessage,
369        link_depth: u32,
370    ) -> Result<Option<ChildEntry>> {
371        match &link.target {
372            LinkTarget::Hard { address } => Ok(Some(ChildEntry {
373                name: link.name.clone(),
374                location: self.local_location(*address),
375            })),
376            LinkTarget::Soft { path } => Ok(self
377                .resolve_soft_link_depth(path, link_depth)
378                .ok()
379                .map(|location| ChildEntry {
380                    name: link.name.clone(),
381                    location,
382                })),
383            LinkTarget::External { filename, path } => Ok(self
384                .resolve_external_link_depth(filename, path, link_depth)?
385                .map(|location| ChildEntry {
386                    name: link.name.clone(),
387                    location,
388                })),
389        }
390    }
391
392    fn resolve_old_style_group_storage(&self, st: &SymbolTableMessage) -> Result<Vec<ChildEntry>> {
393        let heap = LocalHeap::parse_at_storage(
394            self.context.storage.as_ref(),
395            st.heap_address,
396            self.offset_size(),
397            self.length_size(),
398        )?;
399
400        let leaves = btree_v1::collect_btree_v1_leaves_storage(
401            self.context.storage.as_ref(),
402            st.btree_address,
403            self.offset_size(),
404            self.length_size(),
405            None,
406            &[],
407            None,
408        )?;
409
410        let mut children = Vec::new();
411        for (_key, snod_address) in &leaves {
412            let header_len = 8 + 2 * usize::from(self.offset_size());
413            let prefix = self.context.read_range(*snod_address, header_len)?;
414            let mut prefix_cursor = Cursor::new(prefix.as_ref());
415            let sig = prefix_cursor.read_bytes(4)?;
416            if sig != *b"SNOD" {
417                return Err(Error::InvalidData(format!(
418                    "expected SNOD signature at offset {:#x}",
419                    snod_address
420                )));
421            }
422            let version = prefix_cursor.read_u8()?;
423            if version != 1 {
424                return Err(Error::InvalidData(format!(
425                    "unsupported symbol table node version {}",
426                    version
427                )));
428            }
429            prefix_cursor.skip(1)?;
430            let num_symbols = prefix_cursor.read_u16_le()?;
431            let node_len =
432                8 + usize::from(num_symbols) * (2 * usize::from(self.offset_size()) + 4 + 4 + 16);
433            let bytes = self.context.read_range(*snod_address, node_len)?;
434            let mut cursor = Cursor::new(bytes.as_ref());
435            let snod = crate::symbol_table::SymbolTableNode::parse(
436                &mut cursor,
437                self.offset_size(),
438                self.length_size(),
439            )?;
440
441            for entry in &snod.entries {
442                let name =
443                    heap.get_string_storage(entry.link_name_offset, self.context.storage.as_ref())?;
444                children.push(ChildEntry {
445                    name,
446                    location: self.local_location(entry.object_header_address),
447                });
448            }
449        }
450
451        Ok(children)
452    }
453
454    fn resolve_dense_links_storage(
455        &self,
456        link_info: &LinkInfoMessage,
457        link_depth: u32,
458    ) -> Result<Vec<ChildEntry>> {
459        let heap = FractalHeap::parse_at_storage(
460            self.context.storage.as_ref(),
461            link_info.fractal_heap_address,
462            self.offset_size(),
463            self.length_size(),
464        )?;
465
466        let btree_header = btree_v2::BTreeV2Header::parse_at_storage(
467            self.context.storage.as_ref(),
468            link_info.btree_name_index_address,
469            self.offset_size(),
470            self.length_size(),
471        )?;
472
473        let records = btree_v2::collect_btree_v2_records_storage(
474            self.context.storage.as_ref(),
475            &btree_header,
476            self.offset_size(),
477            self.length_size(),
478            None,
479            &[],
480            None,
481        )?;
482
483        let mut children = Vec::new();
484        let mut direct_block_cache = FractalHeapDirectBlockCache::default();
485        for record in &records {
486            let heap_id = match record {
487                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. }
488                | btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
489                _ => continue,
490            };
491
492            let managed_bytes = heap.get_object_storage_cached_with_registry(
493                heap_id,
494                self.context.storage.as_ref(),
495                self.offset_size(),
496                self.length_size(),
497                &mut direct_block_cache,
498                Some(self.context.filter_registry.as_ref()),
499            )?;
500
501            let mut link_cursor = Cursor::new(&managed_bytes);
502            let link_msg = link::parse(
503                &mut link_cursor,
504                self.offset_size(),
505                self.length_size(),
506                managed_bytes.len(),
507            )?;
508
509            match &link_msg.target {
510                LinkTarget::Hard { address } => {
511                    children.push(ChildEntry {
512                        name: link_msg.name.clone(),
513                        location: self.local_location(*address),
514                    });
515                }
516                LinkTarget::Soft { path } => {
517                    if let Ok(location) = self.resolve_soft_link_depth(path, link_depth) {
518                        children.push(ChildEntry {
519                            name: link_msg.name.clone(),
520                            location,
521                        });
522                    }
523                }
524                LinkTarget::External { filename, path } => {
525                    if let Some(location) =
526                        self.resolve_external_link_depth(filename, path, link_depth)?
527                    {
528                        children.push(ChildEntry {
529                            name: link_msg.name.clone(),
530                            location,
531                        });
532                    }
533                }
534            }
535        }
536
537        Ok(children)
538    }
539
540    fn resolve_dense_link_storage(
541        &self,
542        link_info: &LinkInfoMessage,
543        name: &str,
544        link_depth: u32,
545    ) -> Result<Option<ChildEntry>> {
546        let heap = FractalHeap::parse_at_storage(
547            self.context.storage.as_ref(),
548            link_info.fractal_heap_address,
549            self.offset_size(),
550            self.length_size(),
551        )?;
552
553        let btree_header = btree_v2::BTreeV2Header::parse_at_storage(
554            self.context.storage.as_ref(),
555            link_info.btree_name_index_address,
556            self.offset_size(),
557            self.length_size(),
558        )?;
559
560        let records = btree_v2::collect_btree_v2_link_name_hash_records_storage(
561            self.context.storage.as_ref(),
562            &btree_header,
563            self.offset_size(),
564            self.length_size(),
565            jenkins_lookup3(name.as_bytes()),
566        )?;
567
568        let mut direct_block_cache = FractalHeapDirectBlockCache::default();
569        for record in &records {
570            let btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. } = record else {
571                continue;
572            };
573
574            let managed_bytes = heap.get_object_storage_cached_with_registry(
575                heap_id,
576                self.context.storage.as_ref(),
577                self.offset_size(),
578                self.length_size(),
579                &mut direct_block_cache,
580                Some(self.context.filter_registry.as_ref()),
581            )?;
582
583            let mut link_cursor = Cursor::new(&managed_bytes);
584            let link_msg = link::parse(
585                &mut link_cursor,
586                self.offset_size(),
587                self.length_size(),
588                managed_bytes.len(),
589            )?;
590            if link_msg.name == name {
591                return self.resolve_link_message_target(&link_msg, link_depth);
592            }
593        }
594
595        Ok(None)
596    }
597
598    pub fn child_name_by_address(&self, address: u64) -> Result<Option<String>> {
599        Ok(self
600            .resolve_children()?
601            .into_iter()
602            .find(|child| child.location.address == address)
603            .map(|child| child.name))
604    }
605
606    fn child_context(&self, child: &ChildEntry) -> String {
607        format!("child '{}' at {:#x}", child.name, child.location.address)
608    }
609
610    fn child_object_kind(&self, child: &ChildEntry) -> Result<ChildObjectKind> {
611        let header = self
612            .cached_child_header(child)
613            .map_err(|err| err.with_context(self.child_context(child)))?;
614
615        Ok(classify_child_header(header.as_ref()))
616    }
617
618    fn try_open_child_dataset(&self, child: &ChildEntry) -> Result<Option<Dataset>> {
619        let header = self
620            .cached_child_header(child)
621            .map_err(|err| err.with_context(self.child_context(child)))?;
622
623        if classify_child_header(header.as_ref()) != ChildObjectKind::Dataset {
624            return Ok(None);
625        }
626
627        Dataset::from_parsed_header(
628            crate::dataset::DatasetParseContext {
629                context: child.location.context.clone(),
630            },
631            child.location.address,
632            child.name.clone(),
633            header.as_ref(),
634        )
635        .map(Some)
636        .map_err(|err| err.with_context(self.child_context(child)))
637    }
638
639    fn cached_child_header(
640        &self,
641        child: &ChildEntry,
642    ) -> Result<Arc<crate::object_header::ObjectHeader>> {
643        child
644            .location
645            .context
646            .get_or_parse_header(child.location.address)
647    }
648
649    /// Maximum nesting depth for soft link resolution.
650    const MAX_SOFT_LINK_DEPTH: u32 = 16;
651
652    fn resolve_soft_link_depth(&self, path: &str, depth: u32) -> Result<ObjectLocation> {
653        self.resolve_path_location(path, depth, "soft link")
654    }
655
656    fn resolve_external_link_depth(
657        &self,
658        filename: &str,
659        path: &str,
660        depth: u32,
661    ) -> Result<Option<ObjectLocation>> {
662        if depth >= Self::MAX_SOFT_LINK_DEPTH {
663            return Err(Error::Other(format!(
664                "external link resolution exceeded maximum depth ({}) at '{}:{}'",
665                Self::MAX_SOFT_LINK_DEPTH,
666                filename,
667                path,
668            )));
669        }
670
671        let Some(resolver) = self.context.external_link_resolver.as_ref() else {
672            return Ok(None);
673        };
674        let Some(file) = resolver.resolve_external_link(filename)? else {
675            return Ok(None);
676        };
677        let root = file.root_group()?;
678        Ok(Some(root.resolve_path_location(
679            path,
680            depth + 1,
681            "external link",
682        )?))
683    }
684
685    fn resolve_path_location(
686        &self,
687        path: &str,
688        depth: u32,
689        link_kind: &str,
690    ) -> Result<ObjectLocation> {
691        if depth >= Self::MAX_SOFT_LINK_DEPTH {
692            return Err(Error::Other(format!(
693                "{} resolution exceeded maximum depth ({}) — possible cycle at '{}'",
694                link_kind,
695                Self::MAX_SOFT_LINK_DEPTH,
696                path,
697            )));
698        }
699
700        let parts: Vec<&str> = path
701            .trim_matches('/')
702            .split('/')
703            .filter(|s| !s.is_empty())
704            .collect();
705
706        if parts.is_empty() {
707            return Ok(self.local_location(self.root_address));
708        }
709
710        let start_addr = if path.starts_with('/') {
711            self.root_address
712        } else {
713            self.address
714        };
715
716        let mut current_group = Group::new(
717            self.context.clone(),
718            start_addr,
719            String::new(),
720            self.root_address,
721        );
722
723        for &part in &parts[..parts.len() - 1] {
724            current_group = current_group.group(part)?;
725        }
726
727        let target_name = parts[parts.len() - 1];
728        if let Some(child) = current_group.resolve_child_with_link_depth(target_name, depth + 1)? {
729            return Ok(child.location);
730        }
731
732        Err(Error::Other(format!(
733            "{} target '{}' not found",
734            link_kind, path
735        )))
736    }
737}
738
739fn classify_child_header(header: &crate::object_header::ObjectHeader) -> ChildObjectKind {
740    let mut has_dataset_message = false;
741
742    for msg in &header.messages {
743        match msg {
744            HdfMessage::SymbolTable(_)
745            | HdfMessage::Link(_)
746            | HdfMessage::LinkInfo(_)
747            | HdfMessage::GroupInfo(_) => return ChildObjectKind::Group,
748            HdfMessage::Dataspace(_)
749            | HdfMessage::DataLayout(_)
750            | HdfMessage::FillValue(_)
751            | HdfMessage::FilterPipeline(_) => has_dataset_message = true,
752            _ => {}
753        }
754    }
755
756    if has_dataset_message {
757        ChildObjectKind::Dataset
758    } else {
759        ChildObjectKind::Other
760    }
761}