Skip to main content

rust_hdf5/format/
global_heap.rs

1//! Global Heap Collection (GCOL) -- stores variable-length data such as
2//! variable-length strings.
3//!
4//! Binary layout of a Global Heap Collection:
5//! ```text
6//! "GCOL"              (4 bytes, signature)
7//! version             (1 byte, must be 1)
8//! reserved            (3 bytes)
9//! collection_size     (sizeof_size bytes LE, total including header)
10//!
11//! Followed by heap objects:
12//!   index             (u16 LE, 0 = free space / end marker, 1+ = object)
13//!   ref_count          (u16 LE)
14//!   reserved           (u32 LE)
15//!   size               (sizeof_size bytes LE)
16//!   data               (size bytes, padded to 8-byte alignment)
17//! ```
18//!
19//! A variable-length reference stored in dataset raw data is:
20//! ```text
21//! sequence_length     (u32 LE, length of the vlen sequence)
22//! collection_address  (sizeof_addr bytes LE, address of the GCOL)
23//! object_index        (u32 LE, index within the collection)
24//! ```
25//! Total vlen reference size = 4 + sizeof_addr + 4 bytes.
26
27use crate::format::bytes::read_le_uint as read_size;
28use crate::format::{FormatContext, FormatError, FormatResult};
29
/// Four-byte magic that opens every global heap collection.
const GCOL_SIGNATURE: [u8; 4] = [b'G', b'C', b'O', b'L'];

/// The only collection version this module reads and writes.
const GCOL_VERSION: u8 = 1;

/// Smallest collection size the HDF5 C library accepts (H5HG_MINALLOC);
/// encoded collections are zero-padded up to this many bytes.
const GCOL_MIN_SIZE: usize = 4 * 1024;
38
/// One entry of a global heap collection: an object index paired with the
/// bytes stored under it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GlobalHeapObject {
    /// 1-based object index. Index 0 never names a real object because it is
    /// reserved for the free-space marker.
    pub index: u16,
    /// The object's payload bytes, without any alignment padding.
    pub data: Vec<u8>,
}
47
48/// A global heap collection, containing a set of heap objects.
49#[derive(Debug, Clone, PartialEq, Eq)]
50pub struct GlobalHeapCollection {
51    /// The heap objects in this collection (index > 0).
52    pub objects: Vec<GlobalHeapObject>,
53}
54
55impl GlobalHeapCollection {
56    /// Create an empty global heap collection.
57    pub fn new() -> Self {
58        Self {
59            objects: Vec::new(),
60        }
61    }
62
63    /// Add a data blob to the collection. Returns the 1-based object index.
64    pub fn add_object(&mut self, data: Vec<u8>) -> FormatResult<u16> {
65        let max_index = self.objects.iter().map(|o| o.index).max().unwrap_or(0);
66        // Object index 0 is the reserved free-space marker, so the usable
67        // range is 1..=u16::MAX. Refuse to wrap past it.
68        if max_index == u16::MAX {
69            return Err(FormatError::InvalidData(
70                "global heap collection is full (65535 objects)".into(),
71            ));
72        }
73        let index = max_index + 1;
74        self.objects.push(GlobalHeapObject { index, data });
75        Ok(index)
76    }
77
78    /// Retrieve the data for an object by its 1-based index.
79    pub fn get_object(&self, index: u16) -> Option<&[u8]> {
80        self.objects
81            .iter()
82            .find(|o| o.index == index)
83            .map(|o| o.data.as_slice())
84    }
85
86    /// Encode the collection into a byte vector.
87    ///
88    /// The encoded blob includes the GCOL header and all heap objects,
89    /// followed by a free-space marker (index=0 object).
90    /// The total size is padded to at least 4096 bytes (H5HG_MINALLOC)
91    /// for compatibility with the HDF5 C library.
92    pub fn encode(&self, ctx: &FormatContext) -> Vec<u8> {
93        let ss = ctx.sizeof_size as usize;
94
95        // libhdf5 (H5HGpkg.h) 8-byte-aligns both the collection header and
96        // every object header (H5HG_ALIGN). For ss == 8 the raw sizes are
97        // already multiples of 8, so this is a no-op there and only matters
98        // for files with 4-byte lengths.
99        let header_size = pad_to_8(4 + 1 + 3 + ss); // GCOL + version + reserved + collection_size
100        let objhdr_size = pad_to_8(2 + 2 + 4 + ss); // index + ref_count + reserved + size
101        let mut objects_size: usize = 0;
102        for obj in &self.objects {
103            objects_size += objhdr_size + pad_to_8(obj.data.len());
104        }
105        // Free-space marker carries the same (aligned) object header.
106        let free_marker_size = objhdr_size;
107        let content_size = header_size + objects_size + free_marker_size;
108
109        // HDF5 C library requires collection_size >= 4096 (H5HG_MINALLOC)
110        let collection_size = content_size.max(GCOL_MIN_SIZE);
111        // HDF5 convention: free marker size = collection_size - header - objects
112        // (includes the free marker's own header in the "free space")
113        let free_space = collection_size - header_size - objects_size;
114
115        let mut buf = Vec::with_capacity(collection_size);
116
117        // Header
118        buf.extend_from_slice(&GCOL_SIGNATURE);
119        buf.push(GCOL_VERSION);
120        buf.extend_from_slice(&[0u8; 3]); // reserved
121        buf.extend_from_slice(&(collection_size as u64).to_le_bytes()[..ss]);
122        buf.resize(header_size, 0); // pad header to 8-byte alignment
123
124        // Objects
125        for obj in &self.objects {
126            let obj_start = buf.len();
127            buf.extend_from_slice(&obj.index.to_le_bytes());
128            buf.extend_from_slice(&1u16.to_le_bytes()); // ref_count = 1
129            buf.extend_from_slice(&0u32.to_le_bytes()); // reserved
130            buf.extend_from_slice(&(obj.data.len() as u64).to_le_bytes()[..ss]);
131            buf.resize(obj_start + objhdr_size, 0); // pad object header
132            buf.extend_from_slice(&obj.data);
133            buf.resize(buf.len() + (pad_to_8(obj.data.len()) - obj.data.len()), 0);
134        }
135
136        // Free-space marker (index = 0) with remaining space
137        buf.extend_from_slice(&0u16.to_le_bytes()); // index = 0
138        buf.extend_from_slice(&0u16.to_le_bytes()); // ref_count = 0
139        buf.extend_from_slice(&0u32.to_le_bytes()); // reserved
140        buf.extend_from_slice(&(free_space as u64).to_le_bytes()[..ss]); // free space size
141
142        // Zero-fill remaining space
143        buf.resize(collection_size, 0);
144
145        debug_assert_eq!(buf.len(), collection_size);
146        buf
147    }
148
149    /// Decode a global heap collection from a byte buffer.
150    ///
151    /// Returns the collection and the number of bytes consumed.
152    pub fn decode(buf: &[u8], ctx: &FormatContext) -> FormatResult<(Self, usize)> {
153        let ss = ctx.sizeof_size as usize;
154        let header_size = pad_to_8(4 + 1 + 3 + ss);
155        let objhdr_size = pad_to_8(2 + 2 + 4 + ss);
156
157        if buf.len() < header_size {
158            return Err(FormatError::BufferTooShort {
159                needed: header_size,
160                available: buf.len(),
161            });
162        }
163
164        // Signature
165        if buf[0..4] != GCOL_SIGNATURE {
166            return Err(FormatError::InvalidSignature);
167        }
168
169        // Version
170        let version = buf[4];
171        if version != GCOL_VERSION {
172            return Err(FormatError::InvalidVersion(version));
173        }
174
175        // Reserved (bytes 5..8) -- skip
176
177        // Collection size
178        let collection_size = read_size(&buf[8..], ss) as usize;
179
180        if buf.len() < collection_size {
181            return Err(FormatError::BufferTooShort {
182                needed: collection_size,
183                available: buf.len(),
184            });
185        }
186
187        // Parse objects
188        let mut pos = header_size;
189        let mut objects = Vec::new();
190
191        while pos + objhdr_size <= collection_size {
192            let obj_start = pos;
193            let index = u16::from_le_bytes([buf[pos], buf[pos + 1]]);
194            pos += 2;
195            let _ref_count = u16::from_le_bytes([buf[pos], buf[pos + 1]]);
196            pos += 2;
197            let _reserved =
198                u32::from_le_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]);
199            pos += 4;
200            let size = read_size(&buf[pos..], ss) as usize;
201            // Skip any object-header alignment padding.
202            pos = obj_start + objhdr_size;
203
204            if index == 0 {
205                // Free-space marker -- end of used objects
206                break;
207            }
208
209            // `size` is a file field up to 8 bytes wide; use a checked add
210            // so a crafted value cannot wrap `pos + size` into a small (or
211            // `< pos`) end offset that bypasses the bound check or panics
212            // the slice below.
213            let obj_end = pos
214                .checked_add(size)
215                .filter(|&end| end <= collection_size)
216                .ok_or_else(|| {
217                    FormatError::InvalidData(format!(
218                        "global heap object {} extends past collection boundary",
219                        index,
220                    ))
221                })?;
222
223            let data = buf[pos..obj_end].to_vec();
224            let padded = pad_to_8(size);
225            pos += padded;
226
227            objects.push(GlobalHeapObject { index, data });
228        }
229
230        Ok((Self { objects }, collection_size))
231    }
232}
233
234impl Default for GlobalHeapCollection {
235    fn default() -> Self {
236        Self::new()
237    }
238}
239
240/// Encode a variable-length reference (used in dataset raw data).
241///
242/// On-disk format per element:
243///   sequence_length (u32 LE) + collection_address (sizeof_addr bytes) + object_index (u32 LE).
244pub fn encode_vlen_reference(
245    sequence_length: u32,
246    collection_addr: u64,
247    object_index: u32,
248    ctx: &FormatContext,
249) -> Vec<u8> {
250    let sa = ctx.sizeof_addr as usize;
251    let mut buf = Vec::with_capacity(4 + sa + 4);
252    buf.extend_from_slice(&sequence_length.to_le_bytes());
253    buf.extend_from_slice(&collection_addr.to_le_bytes()[..sa]);
254    buf.extend_from_slice(&object_index.to_le_bytes());
255    buf
256}
257
258/// Decode a variable-length reference from dataset raw data.
259///
260/// Returns `(sequence_length, collection_address, object_index)`.
261pub fn decode_vlen_reference(buf: &[u8], ctx: &FormatContext) -> FormatResult<(u32, u64, u32)> {
262    let sa = ctx.sizeof_addr as usize;
263    let total = 4 + sa + 4;
264    if buf.len() < total {
265        return Err(FormatError::BufferTooShort {
266            needed: total,
267            available: buf.len(),
268        });
269    }
270    let seq_len = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
271    let addr = read_size(&buf[4..], sa);
272    let index = u32::from_le_bytes([
273        buf[4 + sa],
274        buf[4 + sa + 1],
275        buf[4 + sa + 2],
276        buf[4 + sa + 3],
277    ]);
278    Ok((seq_len, addr, index))
279}
280
281/// Return the size of a vlen reference in bytes: 4 + sizeof_addr + 4.
282pub fn vlen_reference_size(ctx: &FormatContext) -> usize {
283    4 + ctx.sizeof_addr as usize + 4
284}
285
/// Smallest multiple of 8 that is greater than or equal to `n`.
fn pad_to_8(n: usize) -> usize {
    match n % 8 {
        // Already aligned: nothing to add.
        0 => n,
        // Otherwise top up by the distance to the next boundary.
        rem => n + (8 - rem),
    }
}
290
291// ======================================================================= tests
292
#[cfg(test)]
mod tests {
    //! Round-trip tests for the encoder/decoder plus checks of the decoder's
    //! error paths and of the encoded layout.

    use super::*;

    /// Context with 8-byte addresses and lengths.
    fn ctx() -> FormatContext {
        FormatContext {
            sizeof_addr: 8,
            sizeof_size: 8,
        }
    }

    /// Context with 4-byte addresses and lengths.
    fn ctx4() -> FormatContext {
        FormatContext {
            sizeof_addr: 4,
            sizeof_size: 4,
        }
    }

    #[test]
    fn empty_collection_roundtrip() {
        let coll = GlobalHeapCollection::new();
        let encoded = coll.encode(&ctx());
        let (decoded, consumed) = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap();
        assert_eq!(consumed, encoded.len());
        assert_eq!(decoded, coll);
        assert!(decoded.objects.is_empty());
    }

    #[test]
    fn single_object_roundtrip() {
        let mut coll = GlobalHeapCollection::new();
        let idx = coll.add_object(b"hello".to_vec()).unwrap();
        assert_eq!(idx, 1);

        let encoded = coll.encode(&ctx());
        let (decoded, consumed) = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap();
        assert_eq!(consumed, encoded.len());
        assert_eq!(decoded.objects.len(), 1);
        assert_eq!(decoded.objects[0].index, 1);
        assert_eq!(decoded.objects[0].data, b"hello");
    }

    #[test]
    fn multiple_objects_roundtrip() {
        let mut coll = GlobalHeapCollection::new();
        let i1 = coll.add_object(b"alpha".to_vec()).unwrap();
        let i2 = coll.add_object(b"beta".to_vec()).unwrap();
        let i3 = coll.add_object(b"gamma delta".to_vec()).unwrap();
        assert_eq!(i1, 1);
        assert_eq!(i2, 2);
        assert_eq!(i3, 3);

        let encoded = coll.encode(&ctx());
        let (decoded, _) = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap();
        assert_eq!(decoded.objects.len(), 3);
        assert_eq!(decoded.get_object(1), Some(b"alpha".as_slice()));
        assert_eq!(decoded.get_object(2), Some(b"beta".as_slice()));
        assert_eq!(decoded.get_object(3), Some(b"gamma delta".as_slice()));
    }

    #[test]
    fn get_object_not_found() {
        let coll = GlobalHeapCollection::new();
        assert_eq!(coll.get_object(1), None);
    }

    #[test]
    fn add_object_rejects_full_collection() {
        // Index u16::MAX is taken, so the next index would wrap to 0.
        let mut coll = GlobalHeapCollection {
            objects: vec![GlobalHeapObject {
                index: u16::MAX,
                data: Vec::new(),
            }],
        };
        let err = coll.add_object(b"x".to_vec()).unwrap_err();
        assert!(matches!(err, FormatError::InvalidData(_)));
        assert_eq!(coll.objects.len(), 1);
    }

    #[test]
    fn padding_to_8() {
        assert_eq!(pad_to_8(0), 0);
        assert_eq!(pad_to_8(1), 8);
        assert_eq!(pad_to_8(7), 8);
        assert_eq!(pad_to_8(8), 8);
        assert_eq!(pad_to_8(9), 16);
        assert_eq!(pad_to_8(16), 16);
    }

    #[test]
    fn vlen_reference_roundtrip() {
        let c = ctx();
        let encoded = encode_vlen_reference(5, 0x1234_5678_9ABC_DEF0, 42, &c);
        assert_eq!(encoded.len(), vlen_reference_size(&c));
        let (seq_len, addr, idx) = decode_vlen_reference(&encoded, &c).unwrap();
        assert_eq!(seq_len, 5);
        assert_eq!(addr, 0x1234_5678_9ABC_DEF0);
        assert_eq!(idx, 42);
    }

    #[test]
    fn vlen_reference_4byte_roundtrip() {
        let c = ctx4();
        let encoded = encode_vlen_reference(10, 0x1234_5678, 7, &c);
        assert_eq!(encoded.len(), 12); // 4 + 4 + 4
        let (seq_len, addr, idx) = decode_vlen_reference(&encoded, &c).unwrap();
        assert_eq!(seq_len, 10);
        assert_eq!(addr, 0x1234_5678);
        assert_eq!(idx, 7);
    }

    #[test]
    fn vlen_reference_size_check() {
        assert_eq!(vlen_reference_size(&ctx()), 16);
        assert_eq!(vlen_reference_size(&ctx4()), 12);
    }

    #[test]
    fn vlen_reference_buffer_too_short() {
        let err = decode_vlen_reference(&[0u8; 15], &ctx()).unwrap_err();
        assert!(matches!(
            err,
            FormatError::BufferTooShort {
                needed: 16,
                available: 15
            }
        ));
    }

    #[test]
    fn decode_bad_signature() {
        let mut buf = vec![0u8; 32];
        buf[0..4].copy_from_slice(b"XYZW");
        let err = GlobalHeapCollection::decode(&buf, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::InvalidSignature));
    }

    #[test]
    fn decode_bad_version() {
        let coll = GlobalHeapCollection::new();
        let mut encoded = coll.encode(&ctx());
        encoded[4] = 99;
        let err = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::InvalidVersion(99)));
    }

    #[test]
    fn decode_buffer_too_short() {
        let buf = [0u8; 4];
        let err = GlobalHeapCollection::decode(&buf, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::BufferTooShort { .. }));
    }

    #[test]
    fn decode_truncated_collection() {
        // The header promises more bytes than the buffer holds.
        let mut coll = GlobalHeapCollection::new();
        coll.add_object(b"hello".to_vec()).unwrap();
        let encoded = coll.encode(&ctx());
        let truncated = &encoded[..encoded.len() - 1];
        let err = GlobalHeapCollection::decode(truncated, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::BufferTooShort { .. }));
    }

    #[test]
    fn decode_object_past_collection_boundary() {
        let mut coll = GlobalHeapCollection::new();
        coll.add_object(b"hello".to_vec()).unwrap();
        let mut encoded = coll.encode(&ctx());
        // With 8-byte sizes the header is 16 bytes and the first object's
        // size field sits at 16 + 2 + 2 + 4 = 24. Claim the object is as
        // large as the whole collection.
        let bogus = encoded.len() as u64;
        encoded[24..32].copy_from_slice(&bogus.to_le_bytes());
        let err = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::InvalidData(_)));
    }

    #[test]
    fn encode_pads_to_minimum_size() {
        let encoded = GlobalHeapCollection::new().encode(&ctx());
        assert_eq!(encoded.len(), GCOL_MIN_SIZE);
        // collection_size field
        assert_eq!(&encoded[8..16], &(GCOL_MIN_SIZE as u64).to_le_bytes()[..]);
        // Free-space marker directly after the 16-byte header: index 0 and a
        // size covering everything after the header.
        assert_eq!(&encoded[16..18], &[0u8, 0][..]);
        assert_eq!(
            &encoded[24..32],
            &((GCOL_MIN_SIZE - 16) as u64).to_le_bytes()[..]
        );
    }

    #[test]
    fn ctx4_roundtrip() {
        let c = ctx4();
        let mut coll = GlobalHeapCollection::new();
        coll.add_object(b"test data".to_vec()).unwrap();
        let encoded = coll.encode(&c);
        let (decoded, consumed) = GlobalHeapCollection::decode(&encoded, &c).unwrap();
        assert_eq!(consumed, encoded.len());
        assert_eq!(decoded.get_object(1), Some(b"test data".as_slice()));
    }

    #[test]
    fn object_data_alignment() {
        // Verify that data of odd sizes still roundtrips correctly due to padding
        let mut coll = GlobalHeapCollection::new();
        coll.add_object(vec![1]).unwrap(); // 1 byte -> padded to 8
        coll.add_object(vec![2, 3, 4, 5, 6, 7, 8, 9, 10]).unwrap(); // 9 bytes -> padded to 16
        coll.add_object(vec![11, 12, 13, 14, 15, 16, 17, 18])
            .unwrap(); // 8 bytes -> stays 8

        let encoded = coll.encode(&ctx());
        let (decoded, _) = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap();
        assert_eq!(decoded.get_object(1), Some([1u8].as_slice()));
        assert_eq!(
            decoded.get_object(2),
            Some([2, 3, 4, 5, 6, 7, 8, 9, 10].as_slice())
        );
        assert_eq!(
            decoded.get_object(3),
            Some([11, 12, 13, 14, 15, 16, 17, 18].as_slice())
        );
    }

    #[test]
    fn empty_data_object() {
        let mut coll = GlobalHeapCollection::new();
        coll.add_object(vec![]).unwrap();
        let encoded = coll.encode(&ctx());
        let (decoded, _) = GlobalHeapCollection::decode(&encoded, &ctx()).unwrap();
        assert_eq!(decoded.get_object(1), Some([].as_slice()));
    }
}