// hdf5_reader/global_heap.rs

//! HDF5 Global Heap Collection (GCOL).
//!
//! Global heaps store variable-length data such as variable-length strings
//! and VL arrays. Each collection is a contiguous block in the file that
//! contains multiple heap objects. Objects are referenced by a global heap
//! ID that encodes the collection address and the object index.
//!
//! An object with index 0 is the free-space sentinel; encountering it
//! terminates parsing of the collection.
10
11use crate::error::{Error, Result};
12use crate::io::Cursor;
13use crate::storage::Storage;
14
/// Magic bytes that open every Global Heap Collection: ASCII `GCOL`.
const GCOL_SIGNATURE: [u8; 4] = [b'G', b'C', b'O', b'L'];
17
/// A single object within a global heap collection.
///
/// Derives `PartialEq`/`Eq` so parsed objects can be compared by value
/// (index, reference count, and payload bytes).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GlobalHeapObject {
    /// Heap object index (1-based; index 0 is the free-space sentinel).
    pub index: u16,
    /// Reference count.
    pub reference_count: u16,
    /// Raw object data, without the trailing alignment padding.
    pub data: Vec<u8>,
}

/// A parsed global heap collection containing zero or more heap objects.
///
/// `Default` yields an empty collection; equality compares the object
/// lists element by element, in order.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct GlobalHeapCollection {
    /// The heap objects in this collection, in the order they were stored.
    pub objects: Vec<GlobalHeapObject>,
}
35
36impl GlobalHeapCollection {
37    /// Parse a global heap collection at the current cursor position.
38    ///
39    /// Format:
40    /// - Signature: `GCOL` (4 bytes)
41    /// - Version: 1 (1 byte)
42    /// - Reserved: 3 bytes
43    /// - Collection size (`length_size` bytes) — total size including header
44    /// - Then global heap objects until the collection is exhausted.
45    ///
46    /// Each global heap object:
47    /// - Heap object index (u16 LE)
48    /// - Reference count (u16 LE)
49    /// - Reserved (4 bytes)
50    /// - Object size (`length_size` bytes)
51    /// - Object data (padded to 8-byte boundary)
52    /// - An index of 0 signals free space / end of objects.
53    pub fn parse(cursor: &mut Cursor, _offset_size: u8, length_size: u8) -> Result<Self> {
54        let header_start = cursor.position();
55
56        let sig = cursor.read_bytes(4)?;
57        if sig != GCOL_SIGNATURE {
58            return Err(Error::InvalidGlobalHeapSignature);
59        }
60
61        let version = cursor.read_u8()?;
62        if version != 1 {
63            return Err(Error::UnsupportedGlobalHeapVersion(version));
64        }
65
66        // Reserved 3 bytes
67        cursor.skip(3)?;
68
69        let collection_size = cursor.read_length(length_size)?;
70
71        // The collection_size includes the header we just read. Calculate the
72        // end boundary so we don't read past it.
73        let collection_end = header_start + collection_size;
74
75        let mut objects = Vec::new();
76
77        loop {
78            // Check if we have enough room for at least an object header
79            // (2 + 2 + 4 + length_size bytes minimum).
80            let min_obj_header = 8 + length_size as u64;
81            if cursor.position() + min_obj_header > collection_end {
82                break;
83            }
84
85            let index = cursor.read_u16_le()?;
86
87            // Index 0 = free space sentinel — stop parsing.
88            if index == 0 {
89                break;
90            }
91
92            let reference_count = cursor.read_u16_le()?;
93            // Reserved 4 bytes
94            cursor.skip(4)?;
95            let obj_size = cursor.read_length(length_size)?;
96
97            // Guard against reading past the collection.
98            if cursor.position() + obj_size > collection_end {
99                return Err(Error::UnexpectedEof {
100                    offset: cursor.position(),
101                    needed: obj_size,
102                    available: collection_end.saturating_sub(cursor.position()),
103                });
104            }
105
106            let data = cursor.read_bytes(obj_size as usize)?.to_vec();
107
108            // Object data is padded to an 8-byte boundary.
109            let padded = (obj_size + 7) & !7;
110            let padding = padded - obj_size;
111            if padding > 0 && cursor.position() + padding <= collection_end {
112                cursor.skip(padding as usize)?;
113            }
114
115            objects.push(GlobalHeapObject {
116                index,
117                reference_count,
118                data,
119            });
120        }
121
122        Ok(GlobalHeapCollection { objects })
123    }
124
125    /// Parse a global heap collection from random-access storage.
126    pub fn parse_at_storage(
127        storage: &dyn Storage,
128        address: u64,
129        offset_size: u8,
130        length_size: u8,
131    ) -> Result<Self> {
132        let header_len = 4 + 1 + 3 + usize::from(length_size);
133        let header = storage.read_range(address, header_len)?;
134        let mut cursor = Cursor::new(header.as_ref());
135        let sig = cursor.read_bytes(4)?;
136        if sig != GCOL_SIGNATURE {
137            return Err(Error::InvalidGlobalHeapSignature);
138        }
139
140        let version = cursor.read_u8()?;
141        if version != 1 {
142            return Err(Error::UnsupportedGlobalHeapVersion(version));
143        }
144
145        cursor.skip(3)?;
146        let collection_size = cursor.read_length(length_size)?;
147        let collection_len = usize::try_from(collection_size).map_err(|_| {
148            Error::InvalidData("global heap collection exceeds platform usize capacity".into())
149        })?;
150        let bytes = storage.read_range(address, collection_len)?;
151        let mut full_cursor = Cursor::new(bytes.as_ref());
152        Self::parse(&mut full_cursor, offset_size, length_size)
153    }
154
155    /// Look up an object by its index within this collection.
156    pub fn get_object(&self, index: u16) -> Option<&GlobalHeapObject> {
157        self.objects.iter().find(|o| o.index == index)
158    }
159}
160
#[cfg(test)]
mod tests {
    use super::*;

    /// Append `value` to `buf` as a little-endian integer that is
    /// `length_size` bytes wide. Only widths 4 and 8 are supported.
    fn push_length(buf: &mut Vec<u8>, value: usize, length_size: u8) {
        match length_size {
            4 => {
                let v = u32::try_from(value).expect("test value fits in u32");
                buf.extend_from_slice(&v.to_le_bytes());
            }
            8 => {
                let v = u64::try_from(value).expect("test value fits in u64");
                buf.extend_from_slice(&v.to_le_bytes());
            }
            _ => panic!("test only supports 4/8"),
        }
    }

    /// Build a minimal global heap collection with the given objects.
    /// Each object is (index, ref_count, data).
    ///
    /// The body ends with a bare 2-byte index of 0 standing in for the
    /// free-space sentinel; the recorded collection size covers the header
    /// plus that body.
    fn build_gcol(objects: &[(u16, u16, &[u8])], length_size: u8) -> Vec<u8> {
        let mut body = Vec::new();
        for &(index, ref_count, data) in objects {
            body.extend_from_slice(&index.to_le_bytes());
            body.extend_from_slice(&ref_count.to_le_bytes());
            body.extend_from_slice(&[0u8; 4]); // reserved
            push_length(&mut body, data.len(), length_size);
            body.extend_from_slice(data);
            // Pad to 8-byte boundary
            let padded = (data.len() + 7) & !7;
            body.resize(body.len() + (padded - data.len()), 0);
        }
        // Free space sentinel (index 0)
        body.extend_from_slice(&0u16.to_le_bytes());

        // Build full collection
        let header_size = 4 + 1 + 3 + length_size as usize; // sig + ver + reserved + size
        let collection_size = header_size + body.len();

        let mut buf = Vec::new();
        buf.extend_from_slice(b"GCOL");
        buf.push(1); // version
        buf.extend_from_slice(&[0, 0, 0]); // reserved
        push_length(&mut buf, collection_size, length_size);
        buf.extend(body);
        buf
    }

    #[test]
    fn test_parse_empty_collection() {
        let data = build_gcol(&[], 8);
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();
        assert!(col.objects.is_empty());
    }

    #[test]
    fn test_parse_single_object() {
        let obj_data = b"hello world";
        let data = build_gcol(&[(1, 1, obj_data)], 8);
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();

        assert_eq!(col.objects.len(), 1);
        assert_eq!(col.objects[0].index, 1);
        assert_eq!(col.objects[0].reference_count, 1);
        assert_eq!(col.objects[0].data, obj_data);
    }

    #[test]
    fn test_parse_multiple_objects() {
        let data = build_gcol(
            &[
                (1, 1, b"alpha"),
                (2, 3, b"beta"),
                (5, 0, b"gamma123"), // 8 bytes, no padding needed
            ],
            8,
        );
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();

        assert_eq!(col.objects.len(), 3);

        let obj1 = col.get_object(1).unwrap();
        assert_eq!(obj1.data, b"alpha");
        assert_eq!(obj1.reference_count, 1);

        let obj2 = col.get_object(2).unwrap();
        assert_eq!(obj2.data, b"beta");
        assert_eq!(obj2.reference_count, 3);

        let obj5 = col.get_object(5).unwrap();
        assert_eq!(obj5.data, b"gamma123");

        assert!(col.get_object(99).is_none());
    }

    #[test]
    fn test_parse_4byte_lengths() {
        let data = build_gcol(&[(1, 2, b"test")], 4);
        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 4, 4).unwrap();

        assert_eq!(col.objects.len(), 1);
        assert_eq!(col.objects[0].data, b"test");
    }

    #[test]
    fn test_parse_without_sentinel() {
        // A completely full collection has no free-space object at all.
        let mut data = build_gcol(&[(7, 1, b"12345678")], 8);
        // Drop the 2-byte sentinel and shrink the recorded size to match.
        data.truncate(data.len() - 2);
        let size = u64::try_from(data.len()).unwrap();
        data[8..16].copy_from_slice(&size.to_le_bytes());

        let mut cursor = Cursor::new(&data);
        let col = GlobalHeapCollection::parse(&mut cursor, 8, 8).unwrap();

        assert_eq!(col.objects.len(), 1);
        assert_eq!(col.objects[0].index, 7);
        assert_eq!(col.objects[0].data, b"12345678");
    }

    #[test]
    fn test_object_size_past_collection_end() {
        let mut data = build_gcol(&[(1, 1, b"abcd")], 8);
        // The object size field sits after the 16-byte collection header
        // and the 8 bytes of index / reference count / reserved.
        data[24..32].copy_from_slice(&1000u64.to_le_bytes());

        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            GlobalHeapCollection::parse(&mut cursor, 8, 8),
            Err(Error::UnexpectedEof { .. })
        ));
    }

    #[test]
    fn test_bad_signature() {
        let mut data = build_gcol(&[], 8);
        data[0] = b'X';
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            GlobalHeapCollection::parse(&mut cursor, 8, 8),
            Err(Error::InvalidGlobalHeapSignature)
        ));
    }

    #[test]
    fn test_bad_version() {
        let mut data = build_gcol(&[], 8);
        data[4] = 2; // version 2
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            GlobalHeapCollection::parse(&mut cursor, 8, 8),
            Err(Error::UnsupportedGlobalHeapVersion(2))
        ));
    }
}
284}