Skip to main content

hdf5_reader/
group.rs

1use std::sync::Arc;
2
3use crate::attribute_api::{
4    collect_attribute_messages_storage, resolve_vlen_bytes_storage, Attribute,
5};
6use crate::btree_v1;
7use crate::btree_v2;
8use crate::dataset::Dataset;
9use crate::error::{Error, Result};
10use crate::fractal_heap::{FractalHeap, FractalHeapDirectBlockCache};
11use crate::io::Cursor;
12use crate::local_heap::LocalHeap;
13use crate::messages::datatype::VarLenKind;
14use crate::messages::link::{self, LinkMessage, LinkTarget};
15use crate::messages::link_info::LinkInfoMessage;
16use crate::messages::symbol_table_msg::SymbolTableMessage;
17use crate::messages::HdfMessage;
18use crate::storage::Storage;
19use crate::FileContext;
20
/// A group within an HDF5 file.
///
/// A `Group` is a handle made of the shared [`FileContext`] plus the address
/// of the group's object header; children and attributes are resolved from
/// that header each time they are requested.
#[derive(Clone)]
pub struct Group {
    /// Shared file state (storage, superblock, header lookup) used for every read.
    context: Arc<FileContext>,
    /// Link name under which this group was reached (empty for internally
    /// constructed traversal handles).
    pub(crate) name: String,
    /// Address of this group's object header.
    pub(crate) address: u64,
    /// Address of the root group's object header, used for resolving soft links.
    pub(crate) root_address: u64,
}
30
/// A resolved link inside a group: the link name and where its target lives.
#[derive(Clone)]
struct ChildEntry {
    /// Link name as stored in the parent group.
    name: String,
    /// File context and object-header address of the link target.
    location: ObjectLocation,
}
36
/// Identifies an object header together with the file context it belongs to.
///
/// The context is carried explicitly because external links can resolve to
/// objects in a different file than the group that holds the link.
#[derive(Clone)]
struct ObjectLocation {
    /// File context that must be used to read the object.
    context: Arc<FileContext>,
    /// Address of the object's header within that context.
    address: u64,
    /// Root group address associated with that context.
    root_address: u64,
}
43
/// Coarse classification of a child object, derived from the message types
/// present in its object header (see `classify_child_header`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChildObjectKind {
    /// Header carries a symbol table, link, link info, or group info message.
    Group,
    /// Header carries dataspace, data layout, fill value, or filter pipeline
    /// messages and no group message.
    Dataset,
    /// Header carries neither group nor dataset messages.
    Other,
}
50
51impl Group {
52    /// Create a group from a known object header address.
53    pub(crate) fn new(
54        context: Arc<FileContext>,
55        address: u64,
56        name: String,
57        root_address: u64,
58    ) -> Self {
59        Group {
60            context,
61            name,
62            address,
63            root_address,
64        }
65    }
66
67    /// Group name.
68    pub fn name(&self) -> &str {
69        &self.name
70    }
71
    /// Object header address of this group within the file.
    ///
    /// This is the address passed to the constructor and used for every
    /// header lookup performed by this handle.
    pub fn address(&self) -> u64 {
        self.address
    }
76
    /// Materialize the full file backing this group.
    ///
    /// Delegates to the file context's `full_file_data`; any error it reports
    /// is returned unchanged.
    pub fn file_data(&self) -> Result<crate::storage::StorageBuffer> {
        self.context.full_file_data()
    }
81
    /// Access the underlying random-access storage backend.
    ///
    /// The returned reference borrows from the file context shared by this group.
    pub fn storage(&self) -> &dyn Storage {
        self.context.storage.as_ref()
    }
86
87    /// Size of file offsets in bytes.
88    pub fn offset_size(&self) -> u8 {
89        self.context.superblock.offset_size
90    }
91
92    /// Size of file lengths in bytes.
93    pub fn length_size(&self) -> u8 {
94        self.context.superblock.length_size
95    }
96
97    /// Parse (or retrieve from cache) the object header at the given address.
98    fn cached_header(&self, addr: u64) -> Result<Arc<crate::object_header::ObjectHeader>> {
99        self.context.get_or_parse_header(addr)
100    }
101
102    fn local_location(&self, address: u64) -> ObjectLocation {
103        ObjectLocation {
104            context: self.context.clone(),
105            address,
106            root_address: self.root_address,
107        }
108    }
109
110    /// List all child groups.
111    pub fn groups(&self) -> Result<Vec<Group>> {
112        let (groups, _) = self.resolve_member_objects()?;
113        Ok(groups)
114    }
115
116    /// List all child members, partitioned into groups and datasets.
117    pub fn members(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
118        self.resolve_member_objects()
119    }
120
121    fn resolve_member_objects(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
122        let children = self.resolve_children()?;
123        let mut groups = Vec::new();
124        let mut datasets = Vec::new();
125        for child in &children {
126            match self.child_object_kind(child)? {
127                ChildObjectKind::Group => {
128                    groups.push(Group::new(
129                        child.location.context.clone(),
130                        child.location.address,
131                        child.name.clone(),
132                        child.location.root_address,
133                    ));
134                }
135                ChildObjectKind::Dataset => {
136                    if let Some(dataset) = self.try_open_child_dataset(child)? {
137                        datasets.push(dataset);
138                    }
139                }
140                ChildObjectKind::Other => {}
141            }
142        }
143        Ok((groups, datasets))
144    }
145
146    /// Get a child group by name.
147    pub fn group(&self, name: &str) -> Result<Group> {
148        let children = self.resolve_children()?;
149        for child in &children {
150            if child.name == name {
151                return match self.child_object_kind(child)? {
152                    ChildObjectKind::Group => Ok(Group::new(
153                        child.location.context.clone(),
154                        child.location.address,
155                        child.name.clone(),
156                        child.location.root_address,
157                    )),
158                    ChildObjectKind::Dataset => Err(Error::GroupNotFound(format!(
159                        "'{}' is a dataset, not a group",
160                        name
161                    ))),
162                    ChildObjectKind::Other => {
163                        Err(Error::GroupNotFound(format!("'{}' is not a group", name)))
164                    }
165                };
166            }
167        }
168        Err(Error::GroupNotFound(name.to_string()))
169    }
170
171    /// List all child datasets.
172    pub fn datasets(&self) -> Result<Vec<Dataset>> {
173        let (_, datasets) = self.resolve_member_objects()?;
174        Ok(datasets)
175    }
176
177    /// Get a child dataset by name.
178    pub fn dataset(&self, name: &str) -> Result<Dataset> {
179        let children = self.resolve_children()?;
180        for child in &children {
181            if child.name == name {
182                if let Some(dataset) = self.try_open_child_dataset(child)? {
183                    return Ok(dataset);
184                }
185                return Err(Error::DatasetNotFound(name.to_string()));
186            }
187        }
188        Err(Error::DatasetNotFound(name.to_string()))
189    }
190
191    /// List attributes on this group.
192    pub fn attributes(&self) -> Result<Vec<Attribute>> {
193        let mut header = (*self.cached_header(self.address)?).clone();
194        header.resolve_shared_messages_storage(
195            self.context.storage.as_ref(),
196            self.offset_size(),
197            self.length_size(),
198        )?;
199        Ok(collect_attribute_messages_storage(
200            &header,
201            self.context.storage.as_ref(),
202            self.offset_size(),
203            self.length_size(),
204            Some(self.context.filter_registry.as_ref()),
205        )?
206        .into_iter()
207        .map(|attr| {
208            let raw_data = match &attr.datatype {
209                crate::messages::datatype::Datatype::VarLen {
210                    base,
211                    kind: VarLenKind::String,
212                    ..
213                } if matches!(
214                    base.as_ref(),
215                    crate::messages::datatype::Datatype::FixedPoint { size: 1, .. }
216                ) && attr.dataspace.num_elements() == 1 =>
217                {
218                    resolve_vlen_bytes_storage(
219                        &attr.raw_data,
220                        self.context.storage.as_ref(),
221                        self.offset_size(),
222                        self.length_size(),
223                    )
224                    .unwrap_or_else(|| attr.raw_data.clone())
225                }
226                _ => attr.raw_data.clone(),
227            };
228            Attribute {
229                name: attr.name,
230                datatype: attr.datatype,
231                shape: match attr.dataspace.dataspace_type {
232                    crate::messages::dataspace::DataspaceType::Scalar => vec![],
233                    crate::messages::dataspace::DataspaceType::Null => vec![0],
234                    crate::messages::dataspace::DataspaceType::Simple => attr.dataspace.dims,
235                },
236                raw_data,
237            }
238        })
239        .collect())
240    }
241
242    /// Find an attribute by name.
243    pub fn attribute(&self, name: &str) -> Result<Attribute> {
244        let attrs = self.attributes()?;
245        attrs
246            .into_iter()
247            .find(|a| a.name == name)
248            .ok_or_else(|| Error::AttributeNotFound(name.to_string()))
249    }
250
251    /// Resolve children from the object header.
252    /// Handles both old-style (symbol table) and new-style (link messages) groups.
253    fn resolve_children(&self) -> Result<Vec<ChildEntry>> {
254        self.resolve_children_with_link_depth(0)
255    }
256
257    /// Resolve children with a soft-link depth counter to prevent cycles.
258    fn resolve_children_with_link_depth(&self, link_depth: u32) -> Result<Vec<ChildEntry>> {
259        let header = self.cached_header(self.address)?;
260
261        let mut children = Vec::new();
262
263        // Check for old-style groups (symbol table message)
264        let mut found_symbol_table = false;
265        // Check for new-style groups (link messages)
266        let mut link_info: Option<LinkInfoMessage> = None;
267        let mut links: Vec<LinkMessage> = Vec::new();
268
269        for msg in &header.messages {
270            match msg {
271                HdfMessage::SymbolTable(st) => {
272                    found_symbol_table = true;
273                    children = self.resolve_old_style_group_storage(st)?;
274                }
275                HdfMessage::Link(link) => {
276                    links.push(link.clone());
277                }
278                HdfMessage::LinkInfo(li) => {
279                    link_info = Some(li.clone());
280                }
281                _ => {}
282            }
283        }
284
285        if !found_symbol_table {
286            // New-style group: use compact links from header messages
287            self.resolve_link_targets(&links, link_depth, &mut children)?;
288
289            // Dense-link storage can coexist with compact links, so merge both.
290            if let Some(ref li) = link_info {
291                if !Cursor::is_undefined_offset(li.fractal_heap_address, self.offset_size()) {
292                    for child in self.resolve_dense_links_storage(li, link_depth)? {
293                        let is_duplicate = children.iter().any(|existing| {
294                            existing.name == child.name
295                                && existing.location.address == child.location.address
296                                && Arc::ptr_eq(&existing.location.context, &child.location.context)
297                        });
298                        if !is_duplicate {
299                            children.push(child);
300                        }
301                    }
302                }
303            }
304        }
305
306        Ok(children)
307    }
308
309    /// Resolve link targets (hard and soft), appending to `children`.
310    fn resolve_link_targets(
311        &self,
312        links: &[LinkMessage],
313        link_depth: u32,
314        children: &mut Vec<ChildEntry>,
315    ) -> Result<()> {
316        for link in links {
317            match &link.target {
318                LinkTarget::Hard { address } => {
319                    children.push(ChildEntry {
320                        name: link.name.clone(),
321                        location: self.local_location(*address),
322                    });
323                }
324                LinkTarget::Soft { path } => {
325                    if let Ok(location) = self.resolve_soft_link_depth(path, link_depth) {
326                        children.push(ChildEntry {
327                            name: link.name.clone(),
328                            location,
329                        });
330                    }
331                }
332                LinkTarget::External { filename, path } => {
333                    if let Some(location) =
334                        self.resolve_external_link_depth(filename, path, link_depth)?
335                    {
336                        children.push(ChildEntry {
337                            name: link.name.clone(),
338                            location,
339                        });
340                    }
341                }
342            }
343        }
344        Ok(())
345    }
346
347    fn resolve_old_style_group_storage(&self, st: &SymbolTableMessage) -> Result<Vec<ChildEntry>> {
348        let heap = LocalHeap::parse_at_storage(
349            self.context.storage.as_ref(),
350            st.heap_address,
351            self.offset_size(),
352            self.length_size(),
353        )?;
354
355        let leaves = btree_v1::collect_btree_v1_leaves_storage(
356            self.context.storage.as_ref(),
357            st.btree_address,
358            self.offset_size(),
359            self.length_size(),
360            None,
361            &[],
362            None,
363        )?;
364
365        let mut children = Vec::new();
366        for (_key, snod_address) in &leaves {
367            let header_len = 8 + 2 * usize::from(self.offset_size());
368            let prefix = self.context.read_range(*snod_address, header_len)?;
369            let mut prefix_cursor = Cursor::new(prefix.as_ref());
370            let sig = prefix_cursor.read_bytes(4)?;
371            if sig != *b"SNOD" {
372                return Err(Error::InvalidData(format!(
373                    "expected SNOD signature at offset {:#x}",
374                    snod_address
375                )));
376            }
377            let version = prefix_cursor.read_u8()?;
378            if version != 1 {
379                return Err(Error::InvalidData(format!(
380                    "unsupported symbol table node version {}",
381                    version
382                )));
383            }
384            prefix_cursor.skip(1)?;
385            let num_symbols = prefix_cursor.read_u16_le()?;
386            let node_len =
387                8 + usize::from(num_symbols) * (2 * usize::from(self.offset_size()) + 4 + 4 + 16);
388            let bytes = self.context.read_range(*snod_address, node_len)?;
389            let mut cursor = Cursor::new(bytes.as_ref());
390            let snod = crate::symbol_table::SymbolTableNode::parse(
391                &mut cursor,
392                self.offset_size(),
393                self.length_size(),
394            )?;
395
396            for entry in &snod.entries {
397                let name =
398                    heap.get_string_storage(entry.link_name_offset, self.context.storage.as_ref())?;
399                children.push(ChildEntry {
400                    name,
401                    location: self.local_location(entry.object_header_address),
402                });
403            }
404        }
405
406        Ok(children)
407    }
408
409    fn resolve_dense_links_storage(
410        &self,
411        link_info: &LinkInfoMessage,
412        link_depth: u32,
413    ) -> Result<Vec<ChildEntry>> {
414        let heap = FractalHeap::parse_at_storage(
415            self.context.storage.as_ref(),
416            link_info.fractal_heap_address,
417            self.offset_size(),
418            self.length_size(),
419        )?;
420
421        let btree_header = btree_v2::BTreeV2Header::parse_at_storage(
422            self.context.storage.as_ref(),
423            link_info.btree_name_index_address,
424            self.offset_size(),
425            self.length_size(),
426        )?;
427
428        let records = btree_v2::collect_btree_v2_records_storage(
429            self.context.storage.as_ref(),
430            &btree_header,
431            self.offset_size(),
432            self.length_size(),
433            None,
434            &[],
435            None,
436        )?;
437
438        let mut children = Vec::new();
439        let mut direct_block_cache = FractalHeapDirectBlockCache::default();
440        for record in &records {
441            let heap_id = match record {
442                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. }
443                | btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
444                _ => continue,
445            };
446
447            let managed_bytes = heap.get_object_storage_cached_with_registry(
448                heap_id,
449                self.context.storage.as_ref(),
450                self.offset_size(),
451                self.length_size(),
452                &mut direct_block_cache,
453                Some(self.context.filter_registry.as_ref()),
454            )?;
455
456            let mut link_cursor = Cursor::new(&managed_bytes);
457            let link_msg = link::parse(
458                &mut link_cursor,
459                self.offset_size(),
460                self.length_size(),
461                managed_bytes.len(),
462            )?;
463
464            match &link_msg.target {
465                LinkTarget::Hard { address } => {
466                    children.push(ChildEntry {
467                        name: link_msg.name.clone(),
468                        location: self.local_location(*address),
469                    });
470                }
471                LinkTarget::Soft { path } => {
472                    if let Ok(location) = self.resolve_soft_link_depth(path, link_depth) {
473                        children.push(ChildEntry {
474                            name: link_msg.name.clone(),
475                            location,
476                        });
477                    }
478                }
479                LinkTarget::External { filename, path } => {
480                    if let Some(location) =
481                        self.resolve_external_link_depth(filename, path, link_depth)?
482                    {
483                        children.push(ChildEntry {
484                            name: link_msg.name.clone(),
485                            location,
486                        });
487                    }
488                }
489            }
490        }
491
492        Ok(children)
493    }
494
495    pub fn child_name_by_address(&self, address: u64) -> Result<Option<String>> {
496        Ok(self
497            .resolve_children()?
498            .into_iter()
499            .find(|child| child.location.address == address)
500            .map(|child| child.name))
501    }
502
503    fn child_context(&self, child: &ChildEntry) -> String {
504        format!("child '{}' at {:#x}", child.name, child.location.address)
505    }
506
507    fn child_object_kind(&self, child: &ChildEntry) -> Result<ChildObjectKind> {
508        let header = self
509            .cached_child_header(child)
510            .map_err(|err| err.with_context(self.child_context(child)))?;
511
512        Ok(classify_child_header(header.as_ref()))
513    }
514
515    fn try_open_child_dataset(&self, child: &ChildEntry) -> Result<Option<Dataset>> {
516        let header = self
517            .cached_child_header(child)
518            .map_err(|err| err.with_context(self.child_context(child)))?;
519
520        if classify_child_header(header.as_ref()) != ChildObjectKind::Dataset {
521            return Ok(None);
522        }
523
524        Dataset::from_parsed_header(
525            crate::dataset::DatasetParseContext {
526                context: child.location.context.clone(),
527            },
528            child.location.address,
529            child.name.clone(),
530            header.as_ref(),
531        )
532        .map(Some)
533        .map_err(|err| err.with_context(self.child_context(child)))
534    }
535
536    fn cached_child_header(
537        &self,
538        child: &ChildEntry,
539    ) -> Result<Arc<crate::object_header::ObjectHeader>> {
540        child
541            .location
542            .context
543            .get_or_parse_header(child.location.address)
544    }
545
    /// Maximum nesting depth for soft link resolution.
    ///
    /// The same limit is applied to external link resolution; reaching it
    /// yields an error instead of recursing further.
    const MAX_SOFT_LINK_DEPTH: u32 = 16;
548
549    fn resolve_soft_link_depth(&self, path: &str, depth: u32) -> Result<ObjectLocation> {
550        self.resolve_path_location(path, depth, "soft link")
551    }
552
553    fn resolve_external_link_depth(
554        &self,
555        filename: &str,
556        path: &str,
557        depth: u32,
558    ) -> Result<Option<ObjectLocation>> {
559        if depth >= Self::MAX_SOFT_LINK_DEPTH {
560            return Err(Error::Other(format!(
561                "external link resolution exceeded maximum depth ({}) at '{}:{}'",
562                Self::MAX_SOFT_LINK_DEPTH,
563                filename,
564                path,
565            )));
566        }
567
568        let Some(resolver) = self.context.external_link_resolver.as_ref() else {
569            return Ok(None);
570        };
571        let Some(file) = resolver.resolve_external_link(filename)? else {
572            return Ok(None);
573        };
574        let root = file.root_group()?;
575        Ok(Some(root.resolve_path_location(
576            path,
577            depth + 1,
578            "external link",
579        )?))
580    }
581
582    fn resolve_path_location(
583        &self,
584        path: &str,
585        depth: u32,
586        link_kind: &str,
587    ) -> Result<ObjectLocation> {
588        if depth >= Self::MAX_SOFT_LINK_DEPTH {
589            return Err(Error::Other(format!(
590                "{} resolution exceeded maximum depth ({}) — possible cycle at '{}'",
591                link_kind,
592                Self::MAX_SOFT_LINK_DEPTH,
593                path,
594            )));
595        }
596
597        let parts: Vec<&str> = path
598            .trim_matches('/')
599            .split('/')
600            .filter(|s| !s.is_empty())
601            .collect();
602
603        if parts.is_empty() {
604            return Ok(self.local_location(self.root_address));
605        }
606
607        let start_addr = if path.starts_with('/') {
608            self.root_address
609        } else {
610            self.address
611        };
612
613        let mut current_group = Group::new(
614            self.context.clone(),
615            start_addr,
616            String::new(),
617            self.root_address,
618        );
619
620        for &part in &parts[..parts.len() - 1] {
621            current_group = current_group.group(part)?;
622        }
623
624        let target_name = parts[parts.len() - 1];
625        let children = current_group.resolve_children_with_link_depth(depth + 1)?;
626        for child in &children {
627            if child.name == target_name {
628                return Ok(child.location.clone());
629            }
630        }
631
632        Err(Error::Other(format!(
633            "{} target '{}' not found",
634            link_kind, path
635        )))
636    }
637}
638
639fn classify_child_header(header: &crate::object_header::ObjectHeader) -> ChildObjectKind {
640    let mut has_dataset_message = false;
641
642    for msg in &header.messages {
643        match msg {
644            HdfMessage::SymbolTable(_)
645            | HdfMessage::Link(_)
646            | HdfMessage::LinkInfo(_)
647            | HdfMessage::GroupInfo(_) => return ChildObjectKind::Group,
648            HdfMessage::Dataspace(_)
649            | HdfMessage::DataLayout(_)
650            | HdfMessage::FillValue(_)
651            | HdfMessage::FilterPipeline(_) => has_dataset_message = true,
652            _ => {}
653        }
654    }
655
656    if has_dataset_message {
657        ChildObjectKind::Dataset
658    } else {
659        ChildObjectKind::Other
660    }
661}