Skip to main content

hdf5_reader/
group.rs

1use std::sync::Arc;
2
3use crate::attribute_api::{
4    collect_attribute_messages_storage, resolve_vlen_bytes_storage, Attribute,
5};
6use crate::btree_v1;
7use crate::btree_v2;
8use crate::dataset::Dataset;
9use crate::error::{Error, Result};
10use crate::fractal_heap::FractalHeap;
11use crate::io::Cursor;
12use crate::local_heap::LocalHeap;
13use crate::messages::link::{self, LinkMessage, LinkTarget};
14use crate::messages::link_info::LinkInfoMessage;
15use crate::messages::symbol_table_msg::SymbolTableMessage;
16use crate::messages::HdfMessage;
17use crate::storage::Storage;
18use crate::FileContext;
19
/// A group within an HDF5 file.
///
/// A `Group` is a lightweight handle: it holds a shared reference to the file
/// context plus the addresses needed to locate its object header, so cloning
/// it clones an `Arc`, a `String` and two integers.
#[derive(Clone)]
pub struct Group {
    /// Shared per-file state; this file reads the storage backend, superblock
    /// sizes, header cache and external-link resolver through it.
    context: Arc<FileContext>,
    /// Name given to this handle. Child groups receive their link name.
    pub(crate) name: String,
    /// Address of this group's object header.
    pub(crate) address: u64,
    /// Address of the root group's object header, used for resolving soft links.
    pub(crate) root_address: u64,
}
29
/// A named link target discovered while enumerating a group's children.
#[derive(Clone)]
struct ChildEntry {
    /// Link name of the child within the group that lists it.
    name: String,
    /// Where the child's object header can be found.
    location: ObjectLocation,
}
35
/// Identifies an object header: the file context it belongs to plus its address.
///
/// The context is carried explicitly because external links can resolve to
/// objects that live in a different file than the group listing them.
#[derive(Clone)]
struct ObjectLocation {
    /// File context that owns the object.
    context: Arc<FileContext>,
    /// Address of the object's header within that file.
    address: u64,
    /// Address of the root group's object header in that file.
    root_address: u64,
}
42
/// Coarse classification of a child object, derived from the message types
/// present in its object header (see `classify_child_header`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChildObjectKind {
    /// Header contains a symbol-table, link, link-info or group-info message.
    Group,
    /// Header contains a dataspace, layout, fill-value or filter-pipeline
    /// message and no group message.
    Dataset,
    /// Header contains neither group nor dataset messages.
    Other,
}
49
50impl Group {
51    /// Create a group from a known object header address.
52    pub(crate) fn new(
53        context: Arc<FileContext>,
54        address: u64,
55        name: String,
56        root_address: u64,
57    ) -> Self {
58        Group {
59            context,
60            name,
61            address,
62            root_address,
63        }
64    }
65
66    /// Group name.
67    pub fn name(&self) -> &str {
68        &self.name
69    }
70
    /// Object header address of this group within the file.
    ///
    /// This is the value passed to the constructor; it is not validated here.
    pub fn address(&self) -> u64 {
        self.address
    }
75
    /// Materialize the full file backing this group.
    ///
    /// Delegates to `FileContext::full_file_data`; any error from that call is
    /// returned unchanged.
    pub fn file_data(&self) -> Result<crate::storage::StorageBuffer> {
        self.context.full_file_data()
    }
80
    /// Access the underlying random-access storage backend.
    ///
    /// The returned reference borrows from this group's shared file context.
    pub fn storage(&self) -> &dyn Storage {
        self.context.storage.as_ref()
    }
85
    /// Size of file offsets in bytes, as recorded in the superblock.
    pub fn offset_size(&self) -> u8 {
        self.context.superblock.offset_size
    }
90
    /// Size of file lengths in bytes, as recorded in the superblock.
    pub fn length_size(&self) -> u8 {
        self.context.superblock.length_size
    }
95
    /// Parse (or retrieve from cache) the object header at the given address.
    ///
    /// The lookup goes through this group's own file context, so `addr` must
    /// refer to an object in the same file as this group.
    fn cached_header(&self, addr: u64) -> Result<Arc<crate::object_header::ObjectHeader>> {
        self.context.get_or_parse_header(addr)
    }
100
101    fn local_location(&self, address: u64) -> ObjectLocation {
102        ObjectLocation {
103            context: self.context.clone(),
104            address,
105            root_address: self.root_address,
106        }
107    }
108
109    /// List all child groups.
110    pub fn groups(&self) -> Result<Vec<Group>> {
111        let (groups, _) = self.resolve_member_objects()?;
112        Ok(groups)
113    }
114
    /// List all child members, partitioned into groups and datasets.
    ///
    /// Returns `(groups, datasets)`. Children whose headers contain neither
    /// group nor dataset messages are reported in the groups list.
    pub fn members(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
        self.resolve_member_objects()
    }
119
120    fn resolve_member_objects(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
121        let children = self.resolve_children()?;
122        let mut groups = Vec::new();
123        let mut datasets = Vec::new();
124        for child in &children {
125            match self.child_object_kind(child)? {
126                ChildObjectKind::Group | ChildObjectKind::Other => {
127                    groups.push(Group::new(
128                        child.location.context.clone(),
129                        child.location.address,
130                        child.name.clone(),
131                        child.location.root_address,
132                    ));
133                }
134                ChildObjectKind::Dataset => {
135                    if let Some(dataset) = self.try_open_child_dataset(child)? {
136                        datasets.push(dataset);
137                    }
138                }
139            }
140        }
141        Ok((groups, datasets))
142    }
143
144    /// Get a child group by name.
145    pub fn group(&self, name: &str) -> Result<Group> {
146        let children = self.resolve_children()?;
147        for child in &children {
148            if child.name == name {
149                return match self.child_object_kind(child)? {
150                    ChildObjectKind::Group => Ok(Group::new(
151                        child.location.context.clone(),
152                        child.location.address,
153                        child.name.clone(),
154                        child.location.root_address,
155                    )),
156                    ChildObjectKind::Dataset => Err(Error::GroupNotFound(format!(
157                        "'{}' is a dataset, not a group",
158                        name
159                    ))),
160                    ChildObjectKind::Other => Ok(Group::new(
161                        child.location.context.clone(),
162                        child.location.address,
163                        child.name.clone(),
164                        child.location.root_address,
165                    )),
166                };
167            }
168        }
169        Err(Error::GroupNotFound(name.to_string()))
170    }
171
172    /// List all child datasets.
173    pub fn datasets(&self) -> Result<Vec<Dataset>> {
174        let (_, datasets) = self.resolve_member_objects()?;
175        Ok(datasets)
176    }
177
178    /// Get a child dataset by name.
179    pub fn dataset(&self, name: &str) -> Result<Dataset> {
180        let children = self.resolve_children()?;
181        for child in &children {
182            if child.name == name {
183                if let Some(dataset) = self.try_open_child_dataset(child)? {
184                    return Ok(dataset);
185                }
186                return Err(Error::DatasetNotFound(name.to_string()));
187            }
188        }
189        Err(Error::DatasetNotFound(name.to_string()))
190    }
191
192    /// List attributes on this group.
193    pub fn attributes(&self) -> Result<Vec<Attribute>> {
194        let mut header = (*self.cached_header(self.address)?).clone();
195        header.resolve_shared_messages_storage(
196            self.context.storage.as_ref(),
197            self.offset_size(),
198            self.length_size(),
199        )?;
200        Ok(collect_attribute_messages_storage(
201            &header,
202            self.context.storage.as_ref(),
203            self.offset_size(),
204            self.length_size(),
205        )?
206        .into_iter()
207        .map(|attr| {
208            let raw_data = match &attr.datatype {
209                crate::messages::datatype::Datatype::VarLen { base }
210                    if matches!(
211                        base.as_ref(),
212                        crate::messages::datatype::Datatype::FixedPoint { size: 1, .. }
213                    ) && attr.dataspace.num_elements() == 1 =>
214                {
215                    resolve_vlen_bytes_storage(
216                        &attr.raw_data,
217                        self.context.storage.as_ref(),
218                        self.offset_size(),
219                        self.length_size(),
220                    )
221                    .unwrap_or_else(|| attr.raw_data.clone())
222                }
223                _ => attr.raw_data.clone(),
224            };
225            Attribute {
226                name: attr.name,
227                datatype: attr.datatype,
228                shape: match attr.dataspace.dataspace_type {
229                    crate::messages::dataspace::DataspaceType::Scalar => vec![],
230                    crate::messages::dataspace::DataspaceType::Null => vec![0],
231                    crate::messages::dataspace::DataspaceType::Simple => attr.dataspace.dims,
232                },
233                raw_data,
234            }
235        })
236        .collect())
237    }
238
239    /// Find an attribute by name.
240    pub fn attribute(&self, name: &str) -> Result<Attribute> {
241        let attrs = self.attributes()?;
242        attrs
243            .into_iter()
244            .find(|a| a.name == name)
245            .ok_or_else(|| Error::AttributeNotFound(name.to_string()))
246    }
247
    /// Resolve children from the object header.
    /// Handles both old-style (symbol table) and new-style (link messages) groups.
    ///
    /// Starts link resolution at depth 0; see
    /// `resolve_children_with_link_depth` for the depth-limited variant.
    fn resolve_children(&self) -> Result<Vec<ChildEntry>> {
        self.resolve_children_with_link_depth(0)
    }
253
254    /// Resolve children with a soft-link depth counter to prevent cycles.
255    fn resolve_children_with_link_depth(&self, link_depth: u32) -> Result<Vec<ChildEntry>> {
256        let header = self.cached_header(self.address)?;
257
258        let mut children = Vec::new();
259
260        // Check for old-style groups (symbol table message)
261        let mut found_symbol_table = false;
262        // Check for new-style groups (link messages)
263        let mut link_info: Option<LinkInfoMessage> = None;
264        let mut links: Vec<LinkMessage> = Vec::new();
265
266        for msg in &header.messages {
267            match msg {
268                HdfMessage::SymbolTable(st) => {
269                    found_symbol_table = true;
270                    children = self.resolve_old_style_group_storage(st)?;
271                }
272                HdfMessage::Link(link) => {
273                    links.push(link.clone());
274                }
275                HdfMessage::LinkInfo(li) => {
276                    link_info = Some(li.clone());
277                }
278                _ => {}
279            }
280        }
281
282        if !found_symbol_table {
283            // New-style group: use compact links from header messages
284            self.resolve_link_targets(&links, link_depth, &mut children)?;
285
286            // Dense-link storage can coexist with compact links, so merge both.
287            if let Some(ref li) = link_info {
288                if !Cursor::is_undefined_offset(li.fractal_heap_address, self.offset_size()) {
289                    for child in self.resolve_dense_links_storage(li, link_depth)? {
290                        let is_duplicate = children.iter().any(|existing| {
291                            existing.name == child.name
292                                && existing.location.address == child.location.address
293                                && Arc::ptr_eq(&existing.location.context, &child.location.context)
294                        });
295                        if !is_duplicate {
296                            children.push(child);
297                        }
298                    }
299                }
300            }
301        }
302
303        Ok(children)
304    }
305
306    /// Resolve link targets (hard and soft), appending to `children`.
307    fn resolve_link_targets(
308        &self,
309        links: &[LinkMessage],
310        link_depth: u32,
311        children: &mut Vec<ChildEntry>,
312    ) -> Result<()> {
313        for link in links {
314            match &link.target {
315                LinkTarget::Hard { address } => {
316                    children.push(ChildEntry {
317                        name: link.name.clone(),
318                        location: self.local_location(*address),
319                    });
320                }
321                LinkTarget::Soft { path } => {
322                    if let Ok(location) = self.resolve_soft_link_depth(path, link_depth) {
323                        children.push(ChildEntry {
324                            name: link.name.clone(),
325                            location,
326                        });
327                    }
328                }
329                LinkTarget::External { filename, path } => {
330                    if let Some(location) =
331                        self.resolve_external_link_depth(filename, path, link_depth)?
332                    {
333                        children.push(ChildEntry {
334                            name: link.name.clone(),
335                            location,
336                        });
337                    }
338                }
339            }
340        }
341        Ok(())
342    }
343
344    /// Resolve old-style group children via B-tree v1 + local heap.
345    #[allow(dead_code)]
346    fn resolve_old_style_group(
347        &self,
348        st: &SymbolTableMessage,
349        file_data: &[u8],
350    ) -> Result<Vec<ChildEntry>> {
351        let mut heap_cursor = Cursor::new(file_data);
352        heap_cursor.set_position(st.heap_address);
353        let heap = LocalHeap::parse(&mut heap_cursor, self.offset_size(), self.length_size())?;
354
355        let leaves = btree_v1::collect_btree_v1_leaves(
356            file_data,
357            st.btree_address,
358            self.offset_size(),
359            self.length_size(),
360            None,
361            &[],
362            None,
363        )?;
364
365        let mut children = Vec::new();
366        for (_key, snod_address) in &leaves {
367            let mut cursor = Cursor::new(file_data);
368            cursor.set_position(*snod_address);
369            let snod = crate::symbol_table::SymbolTableNode::parse(
370                &mut cursor,
371                self.offset_size(),
372                self.length_size(),
373            )?;
374
375            for entry in &snod.entries {
376                let name = heap.get_string(entry.link_name_offset, file_data)?;
377                children.push(ChildEntry {
378                    name,
379                    location: self.local_location(entry.object_header_address),
380                });
381            }
382        }
383
384        Ok(children)
385    }
386
387    fn resolve_old_style_group_storage(&self, st: &SymbolTableMessage) -> Result<Vec<ChildEntry>> {
388        let heap = LocalHeap::parse_at_storage(
389            self.context.storage.as_ref(),
390            st.heap_address,
391            self.offset_size(),
392            self.length_size(),
393        )?;
394
395        let leaves = btree_v1::collect_btree_v1_leaves_storage(
396            self.context.storage.as_ref(),
397            st.btree_address,
398            self.offset_size(),
399            self.length_size(),
400            None,
401            &[],
402            None,
403        )?;
404
405        let mut children = Vec::new();
406        for (_key, snod_address) in &leaves {
407            let header_len = 8 + 2 * usize::from(self.offset_size());
408            let prefix = self.context.read_range(*snod_address, header_len)?;
409            let mut prefix_cursor = Cursor::new(prefix.as_ref());
410            let sig = prefix_cursor.read_bytes(4)?;
411            if sig != *b"SNOD" {
412                return Err(Error::InvalidData(format!(
413                    "expected SNOD signature at offset {:#x}",
414                    snod_address
415                )));
416            }
417            let version = prefix_cursor.read_u8()?;
418            if version != 1 {
419                return Err(Error::InvalidData(format!(
420                    "unsupported symbol table node version {}",
421                    version
422                )));
423            }
424            prefix_cursor.skip(1)?;
425            let num_symbols = prefix_cursor.read_u16_le()?;
426            let node_len =
427                8 + usize::from(num_symbols) * (2 * usize::from(self.offset_size()) + 4 + 4 + 16);
428            let bytes = self.context.read_range(*snod_address, node_len)?;
429            let mut cursor = Cursor::new(bytes.as_ref());
430            let snod = crate::symbol_table::SymbolTableNode::parse(
431                &mut cursor,
432                self.offset_size(),
433                self.length_size(),
434            )?;
435
436            for entry in &snod.entries {
437                let name =
438                    heap.get_string_storage(entry.link_name_offset, self.context.storage.as_ref())?;
439                children.push(ChildEntry {
440                    name,
441                    location: self.local_location(entry.object_header_address),
442                });
443            }
444        }
445
446        Ok(children)
447    }
448
449    /// Resolve dense links from a fractal heap + B-tree v2.
450    #[allow(dead_code)]
451    fn resolve_dense_links(
452        &self,
453        link_info: &LinkInfoMessage,
454        link_depth: u32,
455        file_data: &[u8],
456    ) -> Result<Vec<ChildEntry>> {
457        let mut heap_cursor = Cursor::new(file_data);
458        heap_cursor.set_position(link_info.fractal_heap_address);
459        let heap = FractalHeap::parse(&mut heap_cursor, self.offset_size(), self.length_size())?;
460
461        let mut btree_cursor = Cursor::new(file_data);
462        btree_cursor.set_position(link_info.btree_name_index_address);
463        let btree_header = btree_v2::BTreeV2Header::parse(
464            &mut btree_cursor,
465            self.offset_size(),
466            self.length_size(),
467        )?;
468
469        let records = btree_v2::collect_btree_v2_records(
470            file_data,
471            &btree_header,
472            self.offset_size(),
473            self.length_size(),
474            None,
475            &[],
476            None,
477        )?;
478
479        let mut children = Vec::new();
480        for record in &records {
481            let heap_id = match record {
482                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. } => heap_id,
483                btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
484                _ => continue,
485            };
486
487            let managed_bytes =
488                heap.get_object(heap_id, file_data, self.offset_size(), self.length_size())?;
489
490            let mut link_cursor = Cursor::new(&managed_bytes);
491            let link_msg = link::parse(
492                &mut link_cursor,
493                self.offset_size(),
494                self.length_size(),
495                managed_bytes.len(),
496            )?;
497
498            match &link_msg.target {
499                LinkTarget::Hard { address } => {
500                    children.push(ChildEntry {
501                        name: link_msg.name.clone(),
502                        location: self.local_location(*address),
503                    });
504                }
505                LinkTarget::Soft { path } => {
506                    if let Ok(location) = self.resolve_soft_link_depth(path, link_depth) {
507                        children.push(ChildEntry {
508                            name: link_msg.name.clone(),
509                            location,
510                        });
511                    }
512                }
513                LinkTarget::External { filename, path } => {
514                    if let Some(location) =
515                        self.resolve_external_link_depth(filename, path, link_depth)?
516                    {
517                        children.push(ChildEntry {
518                            name: link_msg.name.clone(),
519                            location,
520                        });
521                    }
522                }
523            }
524        }
525
526        Ok(children)
527    }
528
529    fn resolve_dense_links_storage(
530        &self,
531        link_info: &LinkInfoMessage,
532        link_depth: u32,
533    ) -> Result<Vec<ChildEntry>> {
534        let heap = FractalHeap::parse_at_storage(
535            self.context.storage.as_ref(),
536            link_info.fractal_heap_address,
537            self.offset_size(),
538            self.length_size(),
539        )?;
540
541        let btree_header = btree_v2::BTreeV2Header::parse_at_storage(
542            self.context.storage.as_ref(),
543            link_info.btree_name_index_address,
544            self.offset_size(),
545            self.length_size(),
546        )?;
547
548        let records = btree_v2::collect_btree_v2_records_storage(
549            self.context.storage.as_ref(),
550            &btree_header,
551            self.offset_size(),
552            self.length_size(),
553            None,
554            &[],
555            None,
556        )?;
557
558        let mut children = Vec::new();
559        for record in &records {
560            let heap_id = match record {
561                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. }
562                | btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
563                _ => continue,
564            };
565
566            let managed_bytes = heap.get_object_storage(
567                heap_id,
568                self.context.storage.as_ref(),
569                self.offset_size(),
570                self.length_size(),
571            )?;
572
573            let mut link_cursor = Cursor::new(&managed_bytes);
574            let link_msg = link::parse(
575                &mut link_cursor,
576                self.offset_size(),
577                self.length_size(),
578                managed_bytes.len(),
579            )?;
580
581            match &link_msg.target {
582                LinkTarget::Hard { address } => {
583                    children.push(ChildEntry {
584                        name: link_msg.name.clone(),
585                        location: self.local_location(*address),
586                    });
587                }
588                LinkTarget::Soft { path } => {
589                    if let Ok(location) = self.resolve_soft_link_depth(path, link_depth) {
590                        children.push(ChildEntry {
591                            name: link_msg.name.clone(),
592                            location,
593                        });
594                    }
595                }
596                LinkTarget::External { filename, path } => {
597                    if let Some(location) =
598                        self.resolve_external_link_depth(filename, path, link_depth)?
599                    {
600                        children.push(ChildEntry {
601                            name: link_msg.name.clone(),
602                            location,
603                        });
604                    }
605                }
606            }
607        }
608
609        Ok(children)
610    }
611
612    pub fn child_name_by_address(&self, address: u64) -> Result<Option<String>> {
613        Ok(self
614            .resolve_children()?
615            .into_iter()
616            .find(|child| child.location.address == address)
617            .map(|child| child.name))
618    }
619
620    fn child_context(&self, child: &ChildEntry) -> String {
621        format!("child '{}' at {:#x}", child.name, child.location.address)
622    }
623
624    fn child_object_kind(&self, child: &ChildEntry) -> Result<ChildObjectKind> {
625        let header = self
626            .cached_child_header(child)
627            .map_err(|err| err.with_context(self.child_context(child)))?;
628
629        Ok(classify_child_header(header.as_ref()))
630    }
631
632    fn try_open_child_dataset(&self, child: &ChildEntry) -> Result<Option<Dataset>> {
633        let header = self
634            .cached_child_header(child)
635            .map_err(|err| err.with_context(self.child_context(child)))?;
636
637        if classify_child_header(header.as_ref()) != ChildObjectKind::Dataset {
638            return Ok(None);
639        }
640
641        Dataset::from_parsed_header(
642            crate::dataset::DatasetParseContext {
643                context: child.location.context.clone(),
644            },
645            child.location.address,
646            child.name.clone(),
647            header.as_ref(),
648        )
649        .map(Some)
650        .map_err(|err| err.with_context(self.child_context(child)))
651    }
652
653    fn cached_child_header(
654        &self,
655        child: &ChildEntry,
656    ) -> Result<Arc<crate::object_header::ObjectHeader>> {
657        child
658            .location
659            .context
660            .get_or_parse_header(child.location.address)
661    }
662
    /// Maximum nesting depth for soft link resolution.
    ///
    /// The same limit is also applied to external link resolution. Reaching it
    /// is reported as an error, which is how link cycles are detected.
    const MAX_SOFT_LINK_DEPTH: u32 = 16;
665
    /// Resolve a soft link's target `path` to an object location.
    ///
    /// Thin wrapper over `resolve_path_location` that labels errors as
    /// "soft link"; `depth` is the current link nesting depth.
    fn resolve_soft_link_depth(&self, path: &str, depth: u32) -> Result<ObjectLocation> {
        self.resolve_path_location(path, depth, "soft link")
    }
669
670    fn resolve_external_link_depth(
671        &self,
672        filename: &str,
673        path: &str,
674        depth: u32,
675    ) -> Result<Option<ObjectLocation>> {
676        if depth >= Self::MAX_SOFT_LINK_DEPTH {
677            return Err(Error::Other(format!(
678                "external link resolution exceeded maximum depth ({}) at '{}:{}'",
679                Self::MAX_SOFT_LINK_DEPTH,
680                filename,
681                path,
682            )));
683        }
684
685        let Some(resolver) = self.context.external_link_resolver.as_ref() else {
686            return Ok(None);
687        };
688        let Some(file) = resolver.resolve_external_link(filename)? else {
689            return Ok(None);
690        };
691        let root = file.root_group()?;
692        Ok(Some(root.resolve_path_location(
693            path,
694            depth + 1,
695            "external link",
696        )?))
697    }
698
699    fn resolve_path_location(
700        &self,
701        path: &str,
702        depth: u32,
703        link_kind: &str,
704    ) -> Result<ObjectLocation> {
705        if depth >= Self::MAX_SOFT_LINK_DEPTH {
706            return Err(Error::Other(format!(
707                "{} resolution exceeded maximum depth ({}) — possible cycle at '{}'",
708                link_kind,
709                Self::MAX_SOFT_LINK_DEPTH,
710                path,
711            )));
712        }
713
714        let parts: Vec<&str> = path
715            .trim_matches('/')
716            .split('/')
717            .filter(|s| !s.is_empty())
718            .collect();
719
720        if parts.is_empty() {
721            return Ok(self.local_location(self.root_address));
722        }
723
724        let start_addr = if path.starts_with('/') {
725            self.root_address
726        } else {
727            self.address
728        };
729
730        let mut current_group = Group::new(
731            self.context.clone(),
732            start_addr,
733            String::new(),
734            self.root_address,
735        );
736
737        for &part in &parts[..parts.len() - 1] {
738            current_group = current_group.group(part)?;
739        }
740
741        let target_name = parts[parts.len() - 1];
742        let children = current_group.resolve_children_with_link_depth(depth + 1)?;
743        for child in &children {
744            if child.name == target_name {
745                return Ok(child.location.clone());
746            }
747        }
748
749        Err(Error::Other(format!(
750            "{} target '{}' not found",
751            link_kind, path
752        )))
753    }
754}
755
756fn classify_child_header(header: &crate::object_header::ObjectHeader) -> ChildObjectKind {
757    let mut has_dataset_message = false;
758
759    for msg in &header.messages {
760        match msg {
761            HdfMessage::SymbolTable(_)
762            | HdfMessage::Link(_)
763            | HdfMessage::LinkInfo(_)
764            | HdfMessage::GroupInfo(_) => return ChildObjectKind::Group,
765            HdfMessage::Dataspace(_)
766            | HdfMessage::DataLayout(_)
767            | HdfMessage::FillValue(_)
768            | HdfMessage::FilterPipeline(_) => has_dataset_message = true,
769            _ => {}
770        }
771    }
772
773    if has_dataset_message {
774        ChildObjectKind::Dataset
775    } else {
776        ChildObjectKind::Other
777    }
778}