Skip to main content

hdf5_reader/
group.rs

1use std::sync::Arc;
2
3use crate::attribute_api::{
4    collect_attribute_messages_storage, resolve_vlen_bytes_storage, Attribute,
5};
6use crate::btree_v1;
7use crate::btree_v2;
8use crate::dataset::Dataset;
9use crate::error::{Error, Result};
10use crate::fractal_heap::FractalHeap;
11use crate::io::Cursor;
12use crate::local_heap::LocalHeap;
13use crate::messages::link::{self, LinkMessage, LinkTarget};
14use crate::messages::link_info::LinkInfoMessage;
15use crate::messages::symbol_table_msg::SymbolTableMessage;
16use crate::messages::HdfMessage;
17use crate::storage::Storage;
18use crate::FileContext;
19
/// A group within an HDF5 file.
///
/// A `Group` is a handle made of a shared file context plus the address of
/// the group's object header. Children, attributes and link targets are
/// looked up through the context when the corresponding method is called.
#[derive(Clone)]
pub struct Group {
    /// Shared per-file state; the methods below reach the storage backend,
    /// the superblock and the object-header lookup through it.
    context: Arc<FileContext>,
    /// Name this handle was created with (see `Group::new`).
    pub(crate) name: String,
    /// Address of this group's object header.
    pub(crate) address: u64,
    /// Address of the root group's object header, used for resolving soft links.
    pub(crate) root_address: u64,
}
29
/// One resolved link inside a group: the link's name paired with the object
/// header address it leads to.
#[derive(Debug, Clone)]
struct ChildEntry {
    /// Link name under which the child is reachable from its parent group.
    name: String,
    /// Object header address of the child. For soft links this is the address
    /// the link path resolved to.
    address: u64,
}
35
36impl Group {
37    /// Create a group from a known object header address.
38    pub(crate) fn new(
39        context: Arc<FileContext>,
40        address: u64,
41        name: String,
42        root_address: u64,
43    ) -> Self {
44        Group {
45            context,
46            name,
47            address,
48            root_address,
49        }
50    }
51
52    /// Group name.
53    pub fn name(&self) -> &str {
54        &self.name
55    }
56
    /// Object header address of this group within the file.
    ///
    /// This is the value the handle was created with; it is returned as-is.
    pub fn address(&self) -> u64 {
        self.address
    }
61
    /// Materialize the full file backing this group.
    ///
    /// Delegates to the file context's `full_file_data`.
    // NOTE(review): presumably this loads the entire file into memory —
    // confirm the cost before calling it on large files.
    ///
    /// # Errors
    ///
    /// Returns whatever error `full_file_data` reports.
    pub fn file_data(&self) -> Result<crate::storage::StorageBuffer> {
        self.context.full_file_data()
    }
66
    /// Access the underlying random-access storage backend.
    ///
    /// The returned reference borrows from the shared file context held by
    /// this group.
    pub fn storage(&self) -> &dyn Storage {
        self.context.storage.as_ref()
    }
71
    /// Size of file offsets in bytes.
    ///
    /// Read from the superblock stored in the file context.
    pub fn offset_size(&self) -> u8 {
        self.context.superblock.offset_size
    }
76
    /// Size of file lengths in bytes.
    ///
    /// Read from the superblock stored in the file context.
    pub fn length_size(&self) -> u8 {
        self.context.superblock.length_size
    }
81
    /// Parse (or retrieve from cache) the object header at the given address.
    ///
    /// Thin wrapper around the file context's `get_or_parse_header`; the
    /// header comes back behind an `Arc`, so callers that need to mutate it
    /// clone the inner value first.
    ///
    /// # Errors
    ///
    /// Returns whatever error `get_or_parse_header` reports for `addr`.
    fn cached_header(&self, addr: u64) -> Result<Arc<crate::object_header::ObjectHeader>> {
        self.context.get_or_parse_header(addr)
    }
86
87    /// List all child groups.
88    pub fn groups(&self) -> Result<Vec<Group>> {
89        let (groups, _) = self.resolve_member_objects()?;
90        Ok(groups)
91    }
92
    /// List all child members, partitioned into groups and datasets.
    ///
    /// The first element of the tuple holds the child groups, the second the
    /// child datasets.
    ///
    /// # Errors
    ///
    /// Returns the error from `resolve_member_objects` if the children cannot
    /// be resolved.
    pub fn members(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
        self.resolve_member_objects()
    }
97
98    fn resolve_member_objects(&self) -> Result<(Vec<Group>, Vec<Dataset>)> {
99        let children = self.resolve_children()?;
100        let mut groups = Vec::new();
101        let mut datasets = Vec::new();
102        for child in &children {
103            if self.child_is_group(child)? {
104                groups.push(Group::new(
105                    self.context.clone(),
106                    child.address,
107                    child.name.clone(),
108                    self.root_address,
109                ));
110            } else if let Some(dataset) = self.try_open_child_dataset(child) {
111                datasets.push(dataset);
112            }
113        }
114        Ok((groups, datasets))
115    }
116
117    /// Get a child group by name.
118    pub fn group(&self, name: &str) -> Result<Group> {
119        let children = self.resolve_children()?;
120        for child in &children {
121            if child.name == name {
122                if self.is_group_at(child.address)? {
123                    return Ok(Group::new(
124                        self.context.clone(),
125                        child.address,
126                        child.name.clone(),
127                        self.root_address,
128                    ));
129                } else {
130                    return Err(Error::GroupNotFound(format!(
131                        "'{}' is a dataset, not a group",
132                        name
133                    )));
134                }
135            }
136        }
137        Err(Error::GroupNotFound(name.to_string()))
138    }
139
140    /// List all child datasets.
141    pub fn datasets(&self) -> Result<Vec<Dataset>> {
142        let (_, datasets) = self.resolve_member_objects()?;
143        Ok(datasets)
144    }
145
146    /// Get a child dataset by name.
147    pub fn dataset(&self, name: &str) -> Result<Dataset> {
148        let children = self.resolve_children()?;
149        for child in &children {
150            if child.name == name {
151                if let Some(dataset) = self.try_open_child_dataset(child) {
152                    return Ok(dataset);
153                }
154                return Err(Error::DatasetNotFound(name.to_string()));
155            }
156        }
157        Err(Error::DatasetNotFound(name.to_string()))
158    }
159
160    /// List attributes on this group.
161    pub fn attributes(&self) -> Result<Vec<Attribute>> {
162        let mut header = (*self.cached_header(self.address)?).clone();
163        header.resolve_shared_messages_storage(
164            self.context.storage.as_ref(),
165            self.offset_size(),
166            self.length_size(),
167        )?;
168        Ok(collect_attribute_messages_storage(
169            &header,
170            self.context.storage.as_ref(),
171            self.offset_size(),
172            self.length_size(),
173        )?
174        .into_iter()
175        .map(|attr| {
176            let raw_data = match &attr.datatype {
177                crate::messages::datatype::Datatype::VarLen { base }
178                    if matches!(
179                        base.as_ref(),
180                        crate::messages::datatype::Datatype::FixedPoint { size: 1, .. }
181                    ) && attr.dataspace.num_elements() == 1 =>
182                {
183                    resolve_vlen_bytes_storage(
184                        &attr.raw_data,
185                        self.context.storage.as_ref(),
186                        self.offset_size(),
187                        self.length_size(),
188                    )
189                    .unwrap_or_else(|| attr.raw_data.clone())
190                }
191                _ => attr.raw_data.clone(),
192            };
193            Attribute {
194                name: attr.name,
195                datatype: attr.datatype,
196                shape: match attr.dataspace.dataspace_type {
197                    crate::messages::dataspace::DataspaceType::Scalar => vec![],
198                    crate::messages::dataspace::DataspaceType::Null => vec![0],
199                    crate::messages::dataspace::DataspaceType::Simple => attr.dataspace.dims,
200                },
201                raw_data,
202            }
203        })
204        .collect())
205    }
206
207    /// Find an attribute by name.
208    pub fn attribute(&self, name: &str) -> Result<Attribute> {
209        let attrs = self.attributes()?;
210        attrs
211            .into_iter()
212            .find(|a| a.name == name)
213            .ok_or_else(|| Error::AttributeNotFound(name.to_string()))
214    }
215
    /// Resolve children from the object header.
    /// Handles both old-style (symbol table) and new-style (link messages) groups.
    ///
    /// This is the entry point for a fresh lookup: it starts the soft-link
    /// depth counter at 0.
    fn resolve_children(&self) -> Result<Vec<ChildEntry>> {
        self.resolve_children_with_link_depth(0)
    }
221
222    /// Resolve children with a soft-link depth counter to prevent cycles.
223    fn resolve_children_with_link_depth(&self, link_depth: u32) -> Result<Vec<ChildEntry>> {
224        let header = self.cached_header(self.address)?;
225
226        let mut children = Vec::new();
227
228        // Check for old-style groups (symbol table message)
229        let mut found_symbol_table = false;
230        // Check for new-style groups (link messages)
231        let mut link_info: Option<LinkInfoMessage> = None;
232        let mut links: Vec<LinkMessage> = Vec::new();
233
234        for msg in &header.messages {
235            match msg {
236                HdfMessage::SymbolTable(st) => {
237                    found_symbol_table = true;
238                    children = self.resolve_old_style_group_storage(st)?;
239                }
240                HdfMessage::Link(link) => {
241                    links.push(link.clone());
242                }
243                HdfMessage::LinkInfo(li) => {
244                    link_info = Some(li.clone());
245                }
246                _ => {}
247            }
248        }
249
250        if !found_symbol_table {
251            // New-style group: use compact links from header messages
252            self.resolve_link_targets(&links, link_depth, &mut children);
253
254            // Dense-link storage can coexist with compact links, so merge both.
255            if let Some(ref li) = link_info {
256                if !Cursor::is_undefined_offset(li.fractal_heap_address, self.offset_size()) {
257                    for child in self.resolve_dense_links_storage(li, link_depth)? {
258                        let is_duplicate = children.iter().any(|existing| {
259                            existing.name == child.name && existing.address == child.address
260                        });
261                        if !is_duplicate {
262                            children.push(child);
263                        }
264                    }
265                }
266            }
267        }
268
269        Ok(children)
270    }
271
272    /// Resolve link targets (hard and soft), appending to `children`.
273    fn resolve_link_targets(
274        &self,
275        links: &[LinkMessage],
276        link_depth: u32,
277        children: &mut Vec<ChildEntry>,
278    ) {
279        for link in links {
280            match &link.target {
281                LinkTarget::Hard { address } => {
282                    children.push(ChildEntry {
283                        name: link.name.clone(),
284                        address: *address,
285                    });
286                }
287                LinkTarget::Soft { path } => {
288                    if let Ok(address) = self.resolve_soft_link_depth(path, link_depth) {
289                        children.push(ChildEntry {
290                            name: link.name.clone(),
291                            address,
292                        });
293                    }
294                }
295                LinkTarget::External { .. } => {
296                    // External links reference other files; skip.
297                }
298            }
299        }
300    }
301
302    /// Resolve old-style group children via B-tree v1 + local heap.
303    #[allow(dead_code)]
304    fn resolve_old_style_group(
305        &self,
306        st: &SymbolTableMessage,
307        file_data: &[u8],
308    ) -> Result<Vec<ChildEntry>> {
309        let mut heap_cursor = Cursor::new(file_data);
310        heap_cursor.set_position(st.heap_address);
311        let heap = LocalHeap::parse(&mut heap_cursor, self.offset_size(), self.length_size())?;
312
313        let leaves = btree_v1::collect_btree_v1_leaves(
314            file_data,
315            st.btree_address,
316            self.offset_size(),
317            self.length_size(),
318            None,
319            &[],
320            None,
321        )?;
322
323        let mut children = Vec::new();
324        for (_key, snod_address) in &leaves {
325            let mut cursor = Cursor::new(file_data);
326            cursor.set_position(*snod_address);
327            let snod = crate::symbol_table::SymbolTableNode::parse(
328                &mut cursor,
329                self.offset_size(),
330                self.length_size(),
331            )?;
332
333            for entry in &snod.entries {
334                let name = heap.get_string(entry.link_name_offset, file_data)?;
335                children.push(ChildEntry {
336                    name,
337                    address: entry.object_header_address,
338                });
339            }
340        }
341
342        Ok(children)
343    }
344
345    fn resolve_old_style_group_storage(&self, st: &SymbolTableMessage) -> Result<Vec<ChildEntry>> {
346        let heap = LocalHeap::parse_at_storage(
347            self.context.storage.as_ref(),
348            st.heap_address,
349            self.offset_size(),
350            self.length_size(),
351        )?;
352
353        let leaves = btree_v1::collect_btree_v1_leaves_storage(
354            self.context.storage.as_ref(),
355            st.btree_address,
356            self.offset_size(),
357            self.length_size(),
358            None,
359            &[],
360            None,
361        )?;
362
363        let mut children = Vec::new();
364        for (_key, snod_address) in &leaves {
365            let header_len = 8 + 2 * usize::from(self.offset_size());
366            let prefix = self.context.read_range(*snod_address, header_len)?;
367            let mut prefix_cursor = Cursor::new(prefix.as_ref());
368            let sig = prefix_cursor.read_bytes(4)?;
369            if sig != *b"SNOD" {
370                return Err(Error::InvalidData(format!(
371                    "expected SNOD signature at offset {:#x}",
372                    snod_address
373                )));
374            }
375            let version = prefix_cursor.read_u8()?;
376            if version != 1 {
377                return Err(Error::InvalidData(format!(
378                    "unsupported symbol table node version {}",
379                    version
380                )));
381            }
382            prefix_cursor.skip(1)?;
383            let num_symbols = prefix_cursor.read_u16_le()?;
384            let node_len =
385                8 + usize::from(num_symbols) * (2 * usize::from(self.offset_size()) + 4 + 4 + 16);
386            let bytes = self.context.read_range(*snod_address, node_len)?;
387            let mut cursor = Cursor::new(bytes.as_ref());
388            let snod = crate::symbol_table::SymbolTableNode::parse(
389                &mut cursor,
390                self.offset_size(),
391                self.length_size(),
392            )?;
393
394            for entry in &snod.entries {
395                let name =
396                    heap.get_string_storage(entry.link_name_offset, self.context.storage.as_ref())?;
397                children.push(ChildEntry {
398                    name,
399                    address: entry.object_header_address,
400                });
401            }
402        }
403
404        Ok(children)
405    }
406
407    /// Resolve dense links from a fractal heap + B-tree v2.
408    #[allow(dead_code)]
409    fn resolve_dense_links(
410        &self,
411        link_info: &LinkInfoMessage,
412        link_depth: u32,
413        file_data: &[u8],
414    ) -> Result<Vec<ChildEntry>> {
415        let mut heap_cursor = Cursor::new(file_data);
416        heap_cursor.set_position(link_info.fractal_heap_address);
417        let heap = FractalHeap::parse(&mut heap_cursor, self.offset_size(), self.length_size())?;
418
419        let mut btree_cursor = Cursor::new(file_data);
420        btree_cursor.set_position(link_info.btree_name_index_address);
421        let btree_header = btree_v2::BTreeV2Header::parse(
422            &mut btree_cursor,
423            self.offset_size(),
424            self.length_size(),
425        )?;
426
427        let records = btree_v2::collect_btree_v2_records(
428            file_data,
429            &btree_header,
430            self.offset_size(),
431            self.length_size(),
432            None,
433            &[],
434            None,
435        )?;
436
437        let mut children = Vec::new();
438        for record in &records {
439            let heap_id = match record {
440                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. } => heap_id,
441                btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
442                _ => continue,
443            };
444
445            let managed_bytes = heap.get_managed_object(
446                heap_id,
447                file_data,
448                self.offset_size(),
449                self.length_size(),
450            )?;
451
452            let mut link_cursor = Cursor::new(&managed_bytes);
453            let link_msg = link::parse(
454                &mut link_cursor,
455                self.offset_size(),
456                self.length_size(),
457                managed_bytes.len(),
458            )?;
459
460            match &link_msg.target {
461                LinkTarget::Hard { address } => {
462                    children.push(ChildEntry {
463                        name: link_msg.name.clone(),
464                        address: *address,
465                    });
466                }
467                LinkTarget::Soft { path } => {
468                    if let Ok(address) = self.resolve_soft_link_depth(path, link_depth) {
469                        children.push(ChildEntry {
470                            name: link_msg.name.clone(),
471                            address,
472                        });
473                    }
474                }
475                LinkTarget::External { .. } => {}
476            }
477        }
478
479        Ok(children)
480    }
481
482    fn resolve_dense_links_storage(
483        &self,
484        link_info: &LinkInfoMessage,
485        link_depth: u32,
486    ) -> Result<Vec<ChildEntry>> {
487        let heap = FractalHeap::parse_at_storage(
488            self.context.storage.as_ref(),
489            link_info.fractal_heap_address,
490            self.offset_size(),
491            self.length_size(),
492        )?;
493
494        let btree_header = btree_v2::BTreeV2Header::parse_at_storage(
495            self.context.storage.as_ref(),
496            link_info.btree_name_index_address,
497            self.offset_size(),
498            self.length_size(),
499        )?;
500
501        let records = btree_v2::collect_btree_v2_records_storage(
502            self.context.storage.as_ref(),
503            &btree_header,
504            self.offset_size(),
505            self.length_size(),
506            None,
507            &[],
508            None,
509        )?;
510
511        let mut children = Vec::new();
512        for record in &records {
513            let heap_id = match record {
514                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. }
515                | btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
516                _ => continue,
517            };
518
519            let managed_bytes = heap.get_managed_object_storage(
520                heap_id,
521                self.context.storage.as_ref(),
522                self.offset_size(),
523                self.length_size(),
524            )?;
525
526            let mut link_cursor = Cursor::new(&managed_bytes);
527            let link_msg = link::parse(
528                &mut link_cursor,
529                self.offset_size(),
530                self.length_size(),
531                managed_bytes.len(),
532            )?;
533
534            match &link_msg.target {
535                LinkTarget::Hard { address } => {
536                    children.push(ChildEntry {
537                        name: link_msg.name.clone(),
538                        address: *address,
539                    });
540                }
541                LinkTarget::Soft { path } => {
542                    if let Ok(address) = self.resolve_soft_link_depth(path, link_depth) {
543                        children.push(ChildEntry {
544                            name: link_msg.name.clone(),
545                            address,
546                        });
547                    }
548                }
549                LinkTarget::External { .. } => {}
550            }
551        }
552
553        Ok(children)
554    }
555
556    pub fn child_name_by_address(&self, address: u64) -> Result<Option<String>> {
557        Ok(self
558            .resolve_children()?
559            .into_iter()
560            .find(|child| child.address == address)
561            .map(|child| child.name))
562    }
563
564    /// Check if the object at the given address is a group (vs a dataset).
565    fn is_group_at(&self, address: u64) -> Result<bool> {
566        let mut header = (*self.cached_header(address)?).clone();
567        header.resolve_shared_messages_storage(
568            self.context.storage.as_ref(),
569            self.offset_size(),
570            self.length_size(),
571        )?;
572        for msg in &header.messages {
573            match msg {
574                HdfMessage::SymbolTable(_)
575                | HdfMessage::Link(_)
576                | HdfMessage::LinkInfo(_)
577                | HdfMessage::GroupInfo(_) => return Ok(true),
578                HdfMessage::DataLayout(_) => return Ok(false),
579                _ => {}
580            }
581        }
582        Ok(true)
583    }
584
585    fn try_open_child_dataset(&self, child: &ChildEntry) -> Option<Dataset> {
586        let header = self.cached_header(child.address).ok()?;
587        Dataset::from_parsed_header(
588            crate::dataset::DatasetParseContext {
589                context: self.context.clone(),
590            },
591            child.address,
592            child.name.clone(),
593            header.as_ref(),
594        )
595        .ok()
596    }
597
598    fn child_is_group(&self, child: &ChildEntry) -> Result<bool> {
599        match self.is_group_at(child.address) {
600            Ok(is_group) => Ok(is_group),
601            Err(_) => Ok(self.try_open_child_dataset(child).is_none()),
602        }
603    }
604
605    /// Maximum nesting depth for soft link resolution.
606    const MAX_SOFT_LINK_DEPTH: u32 = 16;
607
608    fn resolve_soft_link_depth(&self, path: &str, depth: u32) -> Result<u64> {
609        if depth >= Self::MAX_SOFT_LINK_DEPTH {
610            return Err(Error::Other(format!(
611                "soft link resolution exceeded maximum depth ({}) — possible cycle at '{}'",
612                Self::MAX_SOFT_LINK_DEPTH,
613                path,
614            )));
615        }
616
617        let parts: Vec<&str> = path
618            .trim_matches('/')
619            .split('/')
620            .filter(|s| !s.is_empty())
621            .collect();
622
623        if parts.is_empty() {
624            return Ok(self.root_address);
625        }
626
627        let start_addr = if path.starts_with('/') {
628            self.root_address
629        } else {
630            self.address
631        };
632
633        let mut current_group = Group::new(
634            self.context.clone(),
635            start_addr,
636            String::new(),
637            self.root_address,
638        );
639
640        for &part in &parts[..parts.len() - 1] {
641            current_group = current_group.group(part)?;
642        }
643
644        let target_name = parts[parts.len() - 1];
645        let children = current_group.resolve_children_with_link_depth(depth + 1)?;
646        for child in &children {
647            if child.name == target_name {
648                return Ok(child.address);
649            }
650        }
651
652        Err(Error::Other(format!(
653            "soft link target '{}' not found",
654            path
655        )))
656    }
657}