Skip to main content

async_hdf5/
group.rs

1use std::sync::Arc;
2
3use bytes::Bytes;
4
5use crate::dataset::HDF5Dataset;
6use crate::endian::HDF5Reader;
7use crate::error::{HDF5Error, Result};
8use crate::file::read_object_header;
9use crate::messages::attribute::{Attribute, AttributeMessage};
10use crate::messages::link::{LinkMessage, LinkType};
11use crate::messages::link_info::LinkInfoMessage;
12use crate::messages::symbol_table::SymbolTableMessage;
13use crate::object_header::{msg_types, ObjectHeader};
14use crate::reader::AsyncFileReader;
15use crate::superblock::Superblock;
16use crate::{btree, heap};
17
/// One entry of a group's link listing: the link's name together with the
/// file address of the object header it refers to.
///
/// The target may be either a group or a dataset; the link itself does not
/// record which.
#[derive(Clone, Debug)]
pub struct ChildLink {
    /// Name under which the child is linked inside its parent group.
    pub name: String,
    /// Address, within the file, of the child's object header.
    pub address: u64,
}
26
27/// An HDF5 group — a container for datasets and other groups.
28///
29/// Groups can be navigated by name (like a filesystem). Internally, HDF5 uses
30/// two completely different mechanisms depending on the object header version:
31///
32/// - **v1 groups**: Symbol table message → B-tree v1 (type 0) + local heap for names
33/// - **v2 groups**: Inline link messages (small groups) or link info message →
34///   fractal heap + B-tree v2 (large groups)
35#[derive(Debug)]
36pub struct HDF5Group {
37    name: String,
38    header: ObjectHeader,
39    reader: Arc<dyn AsyncFileReader>,
40    raw_reader: Arc<dyn AsyncFileReader>,
41    superblock: Arc<Superblock>,
42}
43
44impl HDF5Group {
45    /// Create a new group from its parsed object header.
46    pub fn new(
47        name: String,
48        header: ObjectHeader,
49        reader: Arc<dyn AsyncFileReader>,
50        raw_reader: Arc<dyn AsyncFileReader>,
51        superblock: Arc<Superblock>,
52    ) -> Self {
53        Self {
54            name,
55            header,
56            reader,
57            raw_reader,
58            superblock,
59        }
60    }
61
62    /// The group's name (not the full path).
63    pub fn name(&self) -> &str {
64        &self.name
65    }
66
67    /// Access the object header.
68    pub fn header(&self) -> &ObjectHeader {
69        &self.header
70    }
71
72    /// List all child links in this group.
73    pub async fn children(&self) -> Result<Vec<ChildLink>> {
74        // Try v2 first: check for inline link messages
75        let link_msgs = self.header.find_messages(msg_types::LINK);
76        if !link_msgs.is_empty() {
77            return self.children_from_link_messages(&link_msgs);
78        }
79
80        // Try v2 dense: check for link info message
81        if let Some(link_info_msg) = self.header.find_message(msg_types::LINK_INFO) {
82            return self.children_from_link_info(&link_info_msg.data).await;
83        }
84
85        // Try v1: check for symbol table message
86        if let Some(sym_msg) = self.header.find_message(msg_types::SYMBOL_TABLE) {
87            return self.children_from_symbol_table(&sym_msg.data).await;
88        }
89
90        // No children
91        Ok(vec![])
92    }
93
94    /// Get a child group by name.
95    pub async fn group(&self, name: &str) -> Result<HDF5Group> {
96        let children = self.children().await?;
97        let child = children
98            .iter()
99            .find(|c| c.name == name)
100            .ok_or_else(|| HDF5Error::NotFound(name.to_string()))?;
101
102        let header = read_object_header(
103            &self.reader,
104            child.address,
105            self.superblock.size_of_offsets,
106            self.superblock.size_of_lengths,
107        )
108        .await?;
109
110        // Verify it's actually a group (has link messages, link info, or symbol table)
111        let is_group = header.find_message(msg_types::LINK).is_some()
112            || header.find_message(msg_types::LINK_INFO).is_some()
113            || header.find_message(msg_types::SYMBOL_TABLE).is_some()
114            // A group may also have no children but have group info
115            || header.find_message(msg_types::GROUP_INFO).is_some()
116            // An empty v2 group might only have nil messages
117            || !header
118                .messages
119                .iter()
120                .any(|m| m.msg_type == msg_types::DATASPACE);
121
122        if !is_group {
123            return Err(HDF5Error::NotAGroup(name.to_string()));
124        }
125
126        Ok(HDF5Group::new(
127            name.to_string(),
128            header,
129            Arc::clone(&self.reader),
130            Arc::clone(&self.raw_reader),
131            Arc::clone(&self.superblock),
132        ))
133    }
134
135    /// Get a child dataset by name.
136    pub async fn dataset(&self, name: &str) -> Result<HDF5Dataset> {
137        let children = self.children().await?;
138        let child = children
139            .iter()
140            .find(|c| c.name == name)
141            .ok_or_else(|| HDF5Error::NotFound(name.to_string()))?;
142
143        let header = read_object_header(
144            &self.reader,
145            child.address,
146            self.superblock.size_of_offsets,
147            self.superblock.size_of_lengths,
148        )
149        .await?;
150
151        // Verify it's a dataset (has dataspace message)
152        if header.find_message(msg_types::DATASPACE).is_none() {
153            return Err(HDF5Error::NotADataset(name.to_string()));
154        }
155
156        HDF5Dataset::new(
157            name.to_string(),
158            header,
159            Arc::clone(&self.reader),
160            Arc::clone(&self.raw_reader),
161            Arc::clone(&self.superblock),
162        )
163    }
164
165    /// Get a child's object header by name (returns the header without
166    /// assuming whether it's a group or dataset).
167    pub async fn child_header(&self, name: &str) -> Result<(u64, ObjectHeader)> {
168        let children = self.children().await?;
169        let child = children
170            .iter()
171            .find(|c| c.name == name)
172            .ok_or_else(|| HDF5Error::NotFound(name.to_string()))?;
173
174        let header = read_object_header(
175            &self.reader,
176            child.address,
177            self.superblock.size_of_offsets,
178            self.superblock.size_of_lengths,
179        )
180        .await?;
181
182        Ok((child.address, header))
183    }
184
185    /// Navigate to a group by slash-separated path (e.g., "science/LSAR/GCOV").
186    pub async fn navigate(&self, path: &str) -> Result<HDF5Group> {
187        let parts: Vec<&str> = path
188            .trim_matches('/')
189            .split('/')
190            .filter(|s| !s.is_empty())
191            .collect();
192
193        let mut current = HDF5Group::new(
194            self.name.clone(),
195            self.header.clone(),
196            Arc::clone(&self.reader),
197            Arc::clone(&self.raw_reader),
198            Arc::clone(&self.superblock),
199        );
200
201        for part in parts {
202            current = current.group(part).await?;
203        }
204
205        Ok(current)
206    }
207
208    /// List all child group names.
209    pub async fn group_names(&self) -> Result<Vec<String>> {
210        let children = self.children().await?;
211        let mut group_names = Vec::new();
212
213        for child in &children {
214            let header = read_object_header(
215                &self.reader,
216                child.address,
217                self.superblock.size_of_offsets,
218                self.superblock.size_of_lengths,
219            )
220            .await?;
221
222            // Check if this is a group (has group-like messages, does NOT have dataspace)
223            let has_dataspace = header.find_message(msg_types::DATASPACE).is_some();
224            if !has_dataspace {
225                group_names.push(child.name.clone());
226            }
227        }
228
229        Ok(group_names)
230    }
231
232    /// List all child dataset names.
233    pub async fn dataset_names(&self) -> Result<Vec<String>> {
234        let children = self.children().await?;
235        let mut dataset_names = Vec::new();
236
237        for child in &children {
238            let header = read_object_header(
239                &self.reader,
240                child.address,
241                self.superblock.size_of_offsets,
242                self.superblock.size_of_lengths,
243            )
244            .await?;
245
246            // Datasets have a dataspace message
247            if header.find_message(msg_types::DATASPACE).is_some() {
248                dataset_names.push(child.name.clone());
249            }
250        }
251
252        Ok(dataset_names)
253    }
254
255    /// Access the reader.
256    pub fn reader(&self) -> &Arc<dyn AsyncFileReader> {
257        &self.reader
258    }
259
260    /// Access the superblock.
261    pub fn superblock(&self) -> &Arc<Superblock> {
262        &self.superblock
263    }
264
265    /// Get all attributes attached to this group, resolving vlen data.
266    pub async fn attributes(&self) -> Vec<Attribute> {
267        attributes_from_header(
268            &self.header,
269            &self.reader,
270            self.superblock.size_of_offsets,
271            self.superblock.size_of_lengths,
272        )
273        .await
274    }
275
276    /// Get a single attribute by name.
277    pub async fn attribute(&self, name: &str) -> Option<Attribute> {
278        self.attributes().await.into_iter().find(|a| a.name == name)
279    }
280
281    // ── Private helpers ────────────────────────────────────────────────────
282
283    /// Extract children from inline v2 link messages.
284    fn children_from_link_messages(
285        &self,
286        link_msgs: &[&crate::object_header::HeaderMessage],
287    ) -> Result<Vec<ChildLink>> {
288        let mut children = Vec::with_capacity(link_msgs.len());
289
290        for msg in link_msgs {
291            let link = LinkMessage::parse(
292                &msg.data,
293                self.superblock.size_of_offsets,
294                self.superblock.size_of_lengths,
295            )?;
296
297            if link.link_type == LinkType::Hard {
298                if let Some(addr) = link.target_address {
299                    children.push(ChildLink {
300                        name: link.name,
301                        address: addr,
302                    });
303                }
304            }
305            // Skip soft/external links for now
306        }
307
308        Ok(children)
309    }
310
311    /// Extract children from dense link storage (link info → fractal heap + B-tree v2).
312    async fn children_from_link_info(&self, data: &Bytes) -> Result<Vec<ChildLink>> {
313        let link_info = LinkInfoMessage::parse(
314            data,
315            self.superblock.size_of_offsets,
316            self.superblock.size_of_lengths,
317        )?;
318
319        // Check if the fractal heap address is defined
320        if HDF5Reader::is_undef_addr(
321            link_info.fractal_heap_address,
322            self.superblock.size_of_offsets,
323        ) {
324            return Ok(vec![]);
325        }
326
327        // Read the fractal heap
328        let fheap = heap::fractal::FractalHeap::read(
329            &self.reader,
330            link_info.fractal_heap_address,
331            self.superblock.size_of_offsets,
332            self.superblock.size_of_lengths,
333        )
334        .await?;
335
336        // Read the B-tree v2 header for the name index
337        let btree_header = btree::v2::BTreeV2Header::read(
338            &self.reader,
339            link_info.name_btree_address,
340            self.superblock.size_of_offsets,
341            self.superblock.size_of_lengths,
342        )
343        .await?;
344
345        // Collect all records from the B-tree
346        let raw_records = btree::v2::collect_all_records(
347            &self.reader,
348            &btree_header,
349            self.superblock.size_of_offsets,
350            self.superblock.size_of_lengths,
351        )
352        .await?;
353
354        // Parse link records
355        let link_records = btree::v2::parse_link_records(&raw_records, btree_header.record_type)?;
356
357        // Resolve each link record through the fractal heap
358        let mut children = Vec::with_capacity(link_records.len());
359        for record in &link_records {
360            let link_msg_bytes = fheap.get_object(&record.heap_id).await?;
361            let link = LinkMessage::parse(
362                &link_msg_bytes,
363                self.superblock.size_of_offsets,
364                self.superblock.size_of_lengths,
365            )?;
366
367            if link.link_type == LinkType::Hard {
368                if let Some(addr) = link.target_address {
369                    children.push(ChildLink {
370                        name: link.name,
371                        address: addr,
372                    });
373                }
374            }
375        }
376
377        Ok(children)
378    }
379
380    /// Extract children from v1 symbol table (B-tree v1 + local heap).
381    async fn children_from_symbol_table(&self, data: &Bytes) -> Result<Vec<ChildLink>> {
382        let sym_table = SymbolTableMessage::parse(
383            data,
384            self.superblock.size_of_offsets,
385            self.superblock.size_of_lengths,
386        )?;
387
388        // Read the local heap
389        let local_heap = heap::local::LocalHeap::read(
390            &self.reader,
391            sym_table.local_heap_address,
392            self.superblock.size_of_offsets,
393            self.superblock.size_of_lengths,
394        )
395        .await?;
396
397        // Traverse the B-tree v1
398        let entries = btree::v1::read_group_btree_v1(
399            &self.reader,
400            sym_table.btree_address,
401            &local_heap,
402            self.superblock.size_of_offsets,
403            self.superblock.size_of_lengths,
404        )
405        .await?;
406
407        Ok(entries
408            .into_iter()
409            .filter(|e| e.cache_type != 2) // skip symbolic links
410            .map(|e| ChildLink {
411                name: e.name,
412                address: e.object_header_address,
413            })
414            .collect())
415    }
416}
417
418/// Extract decoded attributes from inline attribute messages in an object header.
419///
420/// Resolves variable-length data (e.g., vlen strings) via the global heap.
421pub(crate) async fn attributes_from_header(
422    header: &ObjectHeader,
423    reader: &Arc<dyn AsyncFileReader>,
424    size_of_offsets: u8,
425    size_of_lengths: u8,
426) -> Vec<Attribute> {
427    let mut attrs = Vec::new();
428    for msg in header.find_messages(msg_types::ATTRIBUTE) {
429        if let Ok(am) = AttributeMessage::parse(&msg.data, size_of_offsets, size_of_lengths) {
430            match am
431                .to_attribute_resolved(reader, size_of_offsets, size_of_lengths)
432                .await
433            {
434                Ok(attr) => attrs.push(attr),
435                Err(_) => {
436                    // Fall back to non-resolved decode
437                    attrs.push(am.to_attribute());
438                }
439            }
440        }
441    }
442    attrs
443}