Skip to main content

hdf5_reader/
group.rs

1use std::collections::HashMap;
2use std::sync::Arc;
3
4use parking_lot::Mutex;
5
6use crate::attribute_api::{collect_attribute_messages, Attribute};
7use crate::btree_v1;
8use crate::btree_v2;
9use crate::cache::ChunkCache;
10use crate::dataset::Dataset;
11use crate::error::{Error, Result};
12use crate::filters::FilterRegistry;
13use crate::fractal_heap::FractalHeap;
14use crate::io::Cursor;
15use crate::local_heap::LocalHeap;
16use crate::messages::link::{self, LinkMessage, LinkTarget};
17use crate::messages::link_info::LinkInfoMessage;
18use crate::messages::symbol_table_msg::SymbolTableMessage;
19use crate::messages::HdfMessage;
20use crate::object_header::ObjectHeader;
21use crate::symbol_table::SymbolTableNode;
22
23/// A group within an HDF5 file.
24pub struct Group<'f> {
25    file_data: &'f [u8],
26    offset_size: u8,
27    length_size: u8,
28    pub(crate) name: String,
29    pub(crate) address: u64,
30    /// Address of the root group's object header, used for resolving soft links.
31    pub(crate) root_address: u64,
32    pub(crate) chunk_cache: Arc<ChunkCache>,
33    pub(crate) header_cache: Arc<Mutex<HashMap<u64, Arc<ObjectHeader>>>>,
34    pub(crate) filter_registry: Arc<FilterRegistry>,
35}
36
/// Name and object header address of one resolved child of a group.
#[derive(Clone, Debug)]
struct ChildEntry {
    /// Link name of the child inside its parent group.
    name: String,
    /// Object header address the link resolves to.
    address: u64,
}
42
43impl<'f> Group<'f> {
44    /// Create a group from a known object header address.
45    #[allow(clippy::too_many_arguments)]
46    pub(crate) fn new(
47        file_data: &'f [u8],
48        address: u64,
49        name: String,
50        offset_size: u8,
51        length_size: u8,
52        root_address: u64,
53        chunk_cache: Arc<ChunkCache>,
54        header_cache: Arc<Mutex<HashMap<u64, Arc<ObjectHeader>>>>,
55        filter_registry: Arc<FilterRegistry>,
56    ) -> Self {
57        Group {
58            file_data,
59            offset_size,
60            length_size,
61            name,
62            address,
63            root_address,
64            chunk_cache,
65            header_cache,
66            filter_registry,
67        }
68    }
69
70    /// Group name.
71    pub fn name(&self) -> &str {
72        &self.name
73    }
74
75    /// Object header address of this group within the file.
76    pub fn address(&self) -> u64 {
77        self.address
78    }
79
80    /// Access the raw file data backing this group.
81    pub fn file_data(&self) -> &'f [u8] {
82        self.file_data
83    }
84
85    /// Size of file offsets in bytes.
86    pub fn offset_size(&self) -> u8 {
87        self.offset_size
88    }
89
90    /// Size of file lengths in bytes.
91    pub fn length_size(&self) -> u8 {
92        self.length_size
93    }
94
95    /// Parse (or retrieve from cache) the object header at the given address.
96    fn cached_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
97        {
98            let cache = self.header_cache.lock();
99            if let Some(hdr) = cache.get(&addr) {
100                return Ok(Arc::clone(hdr));
101            }
102        }
103        let mut hdr =
104            ObjectHeader::parse_at(self.file_data, addr, self.offset_size, self.length_size)?;
105        hdr.resolve_shared_messages(self.file_data, self.offset_size, self.length_size)?;
106        let arc = Arc::new(hdr);
107        let mut cache = self.header_cache.lock();
108        cache.insert(addr, Arc::clone(&arc));
109        Ok(arc)
110    }
111
112    /// List all child groups.
113    pub fn groups(&self) -> Result<Vec<Group<'f>>> {
114        let (groups, _) = self.resolve_member_objects()?;
115        Ok(groups)
116    }
117
118    /// List all child members, partitioned into groups and datasets.
119    pub fn members(&self) -> Result<(Vec<Group<'f>>, Vec<Dataset<'f>>)> {
120        self.resolve_member_objects()
121    }
122
123    fn resolve_member_objects(&self) -> Result<(Vec<Group<'f>>, Vec<Dataset<'f>>)> {
124        let children = self.resolve_children()?;
125        let mut groups = Vec::new();
126        let mut datasets = Vec::new();
127        for child in &children {
128            if self.child_is_group(child)? {
129                groups.push(Group::new(
130                    self.file_data,
131                    child.address,
132                    child.name.clone(),
133                    self.offset_size,
134                    self.length_size,
135                    self.root_address,
136                    self.chunk_cache.clone(),
137                    self.header_cache.clone(),
138                    self.filter_registry.clone(),
139                ));
140            } else if let Some(dataset) = self.try_open_child_dataset(child) {
141                datasets.push(dataset);
142            }
143        }
144        Ok((groups, datasets))
145    }
146
147    /// Get a child group by name.
148    pub fn group(&self, name: &str) -> Result<Group<'f>> {
149        let children = self.resolve_children()?;
150        for child in &children {
151            if child.name == name {
152                if self.is_group_at(child.address)? {
153                    return Ok(Group::new(
154                        self.file_data,
155                        child.address,
156                        child.name.clone(),
157                        self.offset_size,
158                        self.length_size,
159                        self.root_address,
160                        self.chunk_cache.clone(),
161                        self.header_cache.clone(),
162                        self.filter_registry.clone(),
163                    ));
164                } else {
165                    return Err(Error::GroupNotFound(format!(
166                        "'{}' is a dataset, not a group",
167                        name
168                    )));
169                }
170            }
171        }
172        Err(Error::GroupNotFound(name.to_string()))
173    }
174
175    /// List all child datasets.
176    pub fn datasets(&self) -> Result<Vec<Dataset<'f>>> {
177        let (_, datasets) = self.resolve_member_objects()?;
178        Ok(datasets)
179    }
180
181    /// Get a child dataset by name.
182    pub fn dataset(&self, name: &str) -> Result<Dataset<'f>> {
183        let children = self.resolve_children()?;
184        for child in &children {
185            if child.name == name {
186                if let Some(dataset) = self.try_open_child_dataset(child) {
187                    return Ok(dataset);
188                }
189                return Err(Error::DatasetNotFound(name.to_string()));
190            }
191        }
192        Err(Error::DatasetNotFound(name.to_string()))
193    }
194
195    /// List attributes on this group.
196    pub fn attributes(&self) -> Result<Vec<Attribute>> {
197        let mut header = (*self.cached_header(self.address)?).clone();
198        header.resolve_shared_messages(self.file_data, self.offset_size, self.length_size)?;
199        Ok(
200            collect_attribute_messages(
201                &header,
202                self.file_data,
203                self.offset_size,
204                self.length_size,
205            )?
206            .into_iter()
207            .map(|attr| {
208                Attribute::from_message_with_context(attr, Some(self.file_data), self.offset_size)
209            })
210            .collect(),
211        )
212    }
213
214    /// Find an attribute by name.
215    pub fn attribute(&self, name: &str) -> Result<Attribute> {
216        let attrs = self.attributes()?;
217        attrs
218            .into_iter()
219            .find(|a| a.name == name)
220            .ok_or_else(|| Error::AttributeNotFound(name.to_string()))
221    }
222
223    /// Resolve children from the object header.
224    /// Handles both old-style (symbol table) and new-style (link messages) groups.
225    fn resolve_children(&self) -> Result<Vec<ChildEntry>> {
226        self.resolve_children_with_link_depth(0)
227    }
228
229    /// Resolve children with a soft-link depth counter to prevent cycles.
230    fn resolve_children_with_link_depth(&self, link_depth: u32) -> Result<Vec<ChildEntry>> {
231        let header = self.cached_header(self.address)?;
232
233        let mut children = Vec::new();
234
235        // Check for old-style groups (symbol table message)
236        let mut found_symbol_table = false;
237        // Check for new-style groups (link messages)
238        let mut link_info: Option<LinkInfoMessage> = None;
239        let mut links: Vec<LinkMessage> = Vec::new();
240
241        for msg in &header.messages {
242            match msg {
243                HdfMessage::SymbolTable(st) => {
244                    found_symbol_table = true;
245                    children = self.resolve_old_style_group(st)?;
246                }
247                HdfMessage::Link(link) => {
248                    links.push(link.clone());
249                }
250                HdfMessage::LinkInfo(li) => {
251                    link_info = Some(li.clone());
252                }
253                _ => {}
254            }
255        }
256
257        if !found_symbol_table {
258            // New-style group: use compact links from header messages
259            Self::resolve_link_targets(self, &links, link_depth, &mut children);
260
261            // Dense-link storage can coexist with compact links, so merge both.
262            if let Some(ref li) = link_info {
263                if !Cursor::is_undefined_offset(li.fractal_heap_address, self.offset_size) {
264                    for child in self.resolve_dense_links(li, link_depth)? {
265                        let is_duplicate = children.iter().any(|existing| {
266                            existing.name == child.name && existing.address == child.address
267                        });
268                        if !is_duplicate {
269                            children.push(child);
270                        }
271                    }
272                }
273            }
274        }
275
276        Ok(children)
277    }
278
279    /// Resolve link targets (hard and soft), appending to `children`.
280    fn resolve_link_targets(
281        &self,
282        links: &[LinkMessage],
283        link_depth: u32,
284        children: &mut Vec<ChildEntry>,
285    ) {
286        for link in links {
287            match &link.target {
288                LinkTarget::Hard { address } => {
289                    children.push(ChildEntry {
290                        name: link.name.clone(),
291                        address: *address,
292                    });
293                }
294                LinkTarget::Soft { path } => {
295                    if let Ok(address) = self.resolve_soft_link_depth(path, link_depth) {
296                        children.push(ChildEntry {
297                            name: link.name.clone(),
298                            address,
299                        });
300                    }
301                }
302                LinkTarget::External { .. } => {
303                    // External links reference other files; skip.
304                }
305            }
306        }
307    }
308
309    /// Resolve old-style group children via B-tree v1 + local heap.
310    fn resolve_old_style_group(&self, st: &SymbolTableMessage) -> Result<Vec<ChildEntry>> {
311        // Parse the local heap to get the name table
312        let mut heap_cursor = Cursor::new(self.file_data);
313        heap_cursor.set_position(st.heap_address);
314        let heap = LocalHeap::parse(&mut heap_cursor, self.offset_size, self.length_size)?;
315
316        // Walk the B-tree to collect all symbol table node addresses
317        let leaves = btree_v1::collect_btree_v1_leaves(
318            self.file_data,
319            st.btree_address,
320            self.offset_size,
321            self.length_size,
322            None, // group B-tree, no ndims
323            &[],
324            None,
325        )?;
326
327        let mut children = Vec::new();
328
329        for (_key, snod_address) in &leaves {
330            let mut cursor = Cursor::new(self.file_data);
331            cursor.set_position(*snod_address);
332            let snod = SymbolTableNode::parse(&mut cursor, self.offset_size, self.length_size)?;
333
334            for entry in &snod.entries {
335                let name = heap.get_string(entry.link_name_offset, self.file_data)?;
336                children.push(ChildEntry {
337                    name,
338                    address: entry.object_header_address,
339                });
340            }
341        }
342
343        Ok(children)
344    }
345
346    /// Resolve dense links from a fractal heap + B-tree v2.
347    fn resolve_dense_links(
348        &self,
349        link_info: &LinkInfoMessage,
350        link_depth: u32,
351    ) -> Result<Vec<ChildEntry>> {
352        // Parse the fractal heap at the link_info address.
353        let mut heap_cursor = Cursor::new(self.file_data);
354        heap_cursor.set_position(link_info.fractal_heap_address);
355        let heap = FractalHeap::parse(&mut heap_cursor, self.offset_size, self.length_size)?;
356
357        // Parse the B-tree v2 header at the name index address.
358        let mut btree_cursor = Cursor::new(self.file_data);
359        btree_cursor.set_position(link_info.btree_name_index_address);
360        let btree_header =
361            btree_v2::BTreeV2Header::parse(&mut btree_cursor, self.offset_size, self.length_size)?;
362
363        // Collect all records from the B-tree.
364        let records = btree_v2::collect_btree_v2_records(
365            self.file_data,
366            &btree_header,
367            self.offset_size,
368            self.length_size,
369            None,
370            &[],
371            None,
372        )?;
373
374        let mut children = Vec::new();
375        for record in &records {
376            let heap_id = match record {
377                btree_v2::BTreeV2Record::LinkNameHash { heap_id, .. } => heap_id,
378                btree_v2::BTreeV2Record::CreationOrder { heap_id, .. } => heap_id,
379                _ => continue,
380            };
381
382            // Extract the link message bytes from the fractal heap.
383            let managed_bytes = heap.get_managed_object(
384                heap_id,
385                self.file_data,
386                self.offset_size,
387                self.length_size,
388            )?;
389
390            // Parse the managed bytes as a link message.
391            let mut link_cursor = Cursor::new(&managed_bytes);
392            let link_msg = link::parse(
393                &mut link_cursor,
394                self.offset_size,
395                self.length_size,
396                managed_bytes.len(),
397            )?;
398
399            match &link_msg.target {
400                LinkTarget::Hard { address } => {
401                    children.push(ChildEntry {
402                        name: link_msg.name.clone(),
403                        address: *address,
404                    });
405                }
406                LinkTarget::Soft { path } => {
407                    if let Ok(address) = self.resolve_soft_link_depth(path, link_depth) {
408                        children.push(ChildEntry {
409                            name: link_msg.name.clone(),
410                            address,
411                        });
412                    }
413                }
414                LinkTarget::External { .. } => {
415                    // External links reference other files; skip.
416                }
417            }
418        }
419
420        Ok(children)
421    }
422
423    /// Check if the object at the given address is a group (vs a dataset).
424    /// A group has either a symbol table message, link messages, or link info.
425    /// A dataset has a dataspace + datatype + layout.
426    fn is_group_at(&self, address: u64) -> Result<bool> {
427        let mut header = (*self.cached_header(address)?).clone();
428        header.resolve_shared_messages(self.file_data, self.offset_size, self.length_size)?;
429        for msg in &header.messages {
430            match msg {
431                // Group indicators
432                HdfMessage::SymbolTable(_)
433                | HdfMessage::Link(_)
434                | HdfMessage::LinkInfo(_)
435                | HdfMessage::GroupInfo(_) => return Ok(true),
436                // Dataset indicators
437                HdfMessage::DataLayout(_) => return Ok(false),
438                _ => {}
439            }
440        }
441        // Default: if it has neither, treat as group (root groups can be empty)
442        Ok(true)
443    }
444
445    fn try_open_child_dataset(&self, child: &ChildEntry) -> Option<Dataset<'f>> {
446        let header = self.cached_header(child.address).ok()?;
447        Dataset::from_parsed_header(
448            crate::dataset::DatasetParseContext {
449                file_data: self.file_data,
450                offset_size: self.offset_size,
451                length_size: self.length_size,
452                chunk_cache: self.chunk_cache.clone(),
453                filter_registry: self.filter_registry.clone(),
454            },
455            child.address,
456            child.name.clone(),
457            header.as_ref(),
458        )
459        .ok()
460    }
461
462    fn child_is_group(&self, child: &ChildEntry) -> Result<bool> {
463        match self.is_group_at(child.address) {
464            Ok(is_group) => Ok(is_group),
465            Err(_) => Ok(self.try_open_child_dataset(child).is_none()),
466        }
467    }
468
469    /// Maximum nesting depth for soft link resolution.
470    /// HDF5 C library uses a default of 16.
471    const MAX_SOFT_LINK_DEPTH: u32 = 16;
472
473    fn resolve_soft_link_depth(&self, path: &str, depth: u32) -> Result<u64> {
474        if depth >= Self::MAX_SOFT_LINK_DEPTH {
475            return Err(Error::Other(format!(
476                "soft link resolution exceeded maximum depth ({}) — possible cycle at '{}'",
477                Self::MAX_SOFT_LINK_DEPTH,
478                path,
479            )));
480        }
481
482        let parts: Vec<&str> = path
483            .trim_matches('/')
484            .split('/')
485            .filter(|s| !s.is_empty())
486            .collect();
487
488        if parts.is_empty() {
489            return Ok(self.root_address);
490        }
491
492        // Start from root for absolute paths, from self for relative.
493        let start_addr = if path.starts_with('/') {
494            self.root_address
495        } else {
496            self.address
497        };
498
499        let mut current_group = Group::new(
500            self.file_data,
501            start_addr,
502            String::new(),
503            self.offset_size,
504            self.length_size,
505            self.root_address,
506            self.chunk_cache.clone(),
507            self.header_cache.clone(),
508            self.filter_registry.clone(),
509        );
510
511        // Navigate to the parent of the target
512        for &part in &parts[..parts.len() - 1] {
513            current_group = current_group.group(part)?;
514        }
515
516        // Find the target's address — resolve any soft links encountered along the way
517        let target_name = parts[parts.len() - 1];
518        let children = current_group.resolve_children_with_link_depth(depth + 1)?;
519        for child in &children {
520            if child.name == target_name {
521                return Ok(child.address);
522            }
523        }
524
525        Err(Error::Other(format!(
526            "soft link target '{}' not found",
527            path
528        )))
529    }
530}