Skip to main content

hdf5_reader/
global_heap.rs

1//! HDF5 Global Heap Collection (GCOL).
2//!
3//! Global heaps store variable-length data such as variable-length strings
4//! and VL arrays. Each collection is a contiguous block in the file that
5//! contains multiple heap objects. Objects are referenced by a global heap
6//! ID that encodes the collection address and the object index.
7//!
8//! An object with index 0 marks the free space sentinel and terminates
9//! parsing of the collection.
10
11use crate::error::{Error, Result};
12use crate::io::Cursor;
13
/// Magic bytes that open every Global Heap Collection: the ASCII letters
/// `G`, `C`, `O`, `L`, compared against the first four bytes of the header.
const GCOL_SIGNATURE: [u8; 4] = [b'G', b'C', b'O', b'L'];
16
/// A single object within a global heap collection.
///
/// Two objects compare equal when their index, reference count, and data
/// bytes are all equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GlobalHeapObject {
    /// Heap object index (1-based; index 0 is the free-space sentinel and is
    /// never stored as a parsed object).
    pub index: u16,
    /// Reference count as read from the object header.
    pub reference_count: u16,
    /// Raw object data: exactly the number of bytes declared in the object
    /// header, without the trailing 8-byte alignment padding.
    pub data: Vec<u8>,
}
27
/// A parsed global heap collection containing zero or more heap objects.
///
/// Produced by `GlobalHeapCollection::parse`; individual objects can be
/// looked up by index with `GlobalHeapCollection::get_object`.
#[derive(Debug, Clone)]
pub struct GlobalHeapCollection {
    /// The heap objects in this collection, in the order they were read.
    /// The index-0 free-space sentinel is not included.
    pub objects: Vec<GlobalHeapObject>,
}
34
35impl GlobalHeapCollection {
36    /// Parse a global heap collection at the current cursor position.
37    ///
38    /// Format:
39    /// - Signature: `GCOL` (4 bytes)
40    /// - Version: 1 (1 byte)
41    /// - Reserved: 3 bytes
42    /// - Collection size (`length_size` bytes) — total size including header
43    /// - Then global heap objects until the collection is exhausted.
44    ///
45    /// Each global heap object:
46    /// - Heap object index (u16 LE)
47    /// - Reference count (u16 LE)
48    /// - Reserved (4 bytes)
49    /// - Object size (`length_size` bytes)
50    /// - Object data (padded to 8-byte boundary)
51    /// - An index of 0 signals free space / end of objects.
52    pub fn parse(cursor: &mut Cursor, _offset_size: u8, length_size: u8) -> Result<Self> {
53        let header_start = cursor.position();
54
55        let sig = cursor.read_bytes(4)?;
56        if sig != GCOL_SIGNATURE {
57            return Err(Error::InvalidGlobalHeapSignature);
58        }
59
60        let version = cursor.read_u8()?;
61        if version != 1 {
62            return Err(Error::UnsupportedGlobalHeapVersion(version));
63        }
64
65        // Reserved 3 bytes
66        cursor.skip(3)?;
67
68        let collection_size = cursor.read_length(length_size)?;
69
70        // The collection_size includes the header we just read. Calculate the
71        // end boundary so we don't read past it.
72        let collection_end = header_start + collection_size;
73
74        let mut objects = Vec::new();
75
76        loop {
77            // Check if we have enough room for at least an object header
78            // (2 + 2 + 4 + length_size bytes minimum).
79            let min_obj_header = 8 + length_size as u64;
80            if cursor.position() + min_obj_header > collection_end {
81                break;
82            }
83
84            let index = cursor.read_u16_le()?;
85
86            // Index 0 = free space sentinel — stop parsing.
87            if index == 0 {
88                break;
89            }
90
91            let reference_count = cursor.read_u16_le()?;
92            // Reserved 4 bytes
93            cursor.skip(4)?;
94            let obj_size = cursor.read_length(length_size)?;
95
96            // Guard against reading past the collection.
97            if cursor.position() + obj_size > collection_end {
98                return Err(Error::UnexpectedEof {
99                    offset: cursor.position(),
100                    needed: obj_size,
101                    available: collection_end.saturating_sub(cursor.position()),
102                });
103            }
104
105            let data = cursor.read_bytes(obj_size as usize)?.to_vec();
106
107            // Object data is padded to an 8-byte boundary.
108            let padded = (obj_size + 7) & !7;
109            let padding = padded - obj_size;
110            if padding > 0 && cursor.position() + padding <= collection_end {
111                cursor.skip(padding as usize)?;
112            }
113
114            objects.push(GlobalHeapObject {
115                index,
116                reference_count,
117                data,
118            });
119        }
120
121        Ok(GlobalHeapCollection { objects })
122    }
123
124    /// Look up an object by its index within this collection.
125    pub fn get_object(&self, index: u16) -> Option<&GlobalHeapObject> {
126        self.objects.iter().find(|o| o.index == index)
127    }
128}
129
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `value` as a little-endian length field of `length_size` bytes.
    /// Only the 4- and 8-byte widths used by these tests are supported.
    fn encode_length(value: usize, length_size: u8) -> Vec<u8> {
        match length_size {
            4 => (value as u32).to_le_bytes().to_vec(),
            8 => (value as u64).to_le_bytes().to_vec(),
            _ => panic!("test only supports 4/8"),
        }
    }

    /// Build a minimal global heap collection with the given objects.
    /// Each object is (index, ref_count, data).
    ///
    /// The body ends with a bare two-byte index of 0 standing in for the
    /// free-space sentinel.
    fn build_gcol(objects: &[(u16, u16, &[u8])], length_size: u8) -> Vec<u8> {
        let mut body = Vec::new();
        for &(index, ref_count, data) in objects {
            body.extend_from_slice(&index.to_le_bytes());
            body.extend_from_slice(&ref_count.to_le_bytes());
            body.extend_from_slice(&[0u8; 4]); // reserved
            body.extend_from_slice(&encode_length(data.len(), length_size));
            body.extend_from_slice(data);
            // Pad to 8-byte boundary
            let padded = (data.len() + 7) & !7;
            body.resize(body.len() + (padded - data.len()), 0);
        }
        // Free space sentinel (index 0)
        body.extend_from_slice(&0u16.to_le_bytes());

        // Build full collection
        let header_size = 4 + 1 + 3 + length_size as usize; // sig + ver + reserved + size
        let collection_size = header_size + body.len();

        let mut buf = Vec::new();
        buf.extend_from_slice(b"GCOL");
        buf.push(1); // version
        buf.extend_from_slice(&[0, 0, 0]); // reserved
        buf.extend_from_slice(&encode_length(collection_size, length_size));
        buf.extend(body);
        buf
    }

    #[test]
    fn test_parse_empty_collection() {
        let data = build_gcol(&[], 8);
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();
        assert!(col.objects.is_empty());
    }

    #[test]
    fn test_parse_single_object() {
        let obj_data = b"hello world";
        let data = build_gcol(&[(1, 1, obj_data)], 8);
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();

        assert_eq!(col.objects.len(), 1);
        assert_eq!(col.objects[0].index, 1);
        assert_eq!(col.objects[0].reference_count, 1);
        assert_eq!(col.objects[0].data, obj_data);
    }

    #[test]
    fn test_parse_multiple_objects() {
        let data = build_gcol(
            &[
                (1, 1, b"alpha"),
                (2, 3, b"beta"),
                (5, 0, b"gamma123"), // 8 bytes, no padding needed
            ],
            8,
        );
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();

        assert_eq!(col.objects.len(), 3);

        let obj1 = col.get_object(1).unwrap();
        assert_eq!(obj1.data, b"alpha");
        assert_eq!(obj1.reference_count, 1);

        let obj2 = col.get_object(2).unwrap();
        assert_eq!(obj2.data, b"beta");
        assert_eq!(obj2.reference_count, 3);

        let obj5 = col.get_object(5).unwrap();
        assert_eq!(obj5.data, b"gamma123");

        assert!(col.get_object(99).is_none());
    }

    #[test]
    fn test_parse_4byte_lengths() {
        let data = build_gcol(&[(1, 2, b"test")], 4);
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 4, 4).unwrap();

        assert_eq!(col.objects.len(), 1);
        assert_eq!(col.objects[0].data, b"test");
    }

    #[test]
    fn test_bad_signature() {
        let mut data = build_gcol(&[], 8);
        data[0] = b'X';
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            GlobalHeapCollection::parse(&mut cursor, 8, 8),
            Err(Error::InvalidGlobalHeapSignature)
        ));
    }

    #[test]
    fn test_bad_version() {
        let mut data = build_gcol(&[], 8);
        data[4] = 2; // version 2
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            GlobalHeapCollection::parse(&mut cursor, 8, 8),
            Err(Error::UnsupportedGlobalHeapVersion(2))
        ));
    }

    /// An object whose declared size runs past the end of the collection
    /// must be rejected rather than read.
    #[test]
    fn test_object_size_exceeds_collection() {
        let mut data = build_gcol(&[(1, 1, b"hello world")], 8);
        // The collection header is 16 bytes (sig + ver + reserved + size);
        // the object's size field follows its 8-byte index / ref-count /
        // reserved prefix.
        let size_field = 16 + 8;
        data[size_field..size_field + 8].copy_from_slice(&1000u64.to_le_bytes());
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            GlobalHeapCollection::parse(&mut cursor, 8, 8),
            Err(Error::UnexpectedEof { .. })
        ));
    }
}