Skip to main content

rust_hdf5/format/
global_heap.rs

1//! Global Heap Collection (GCOL) -- stores variable-length data such as
2//! variable-length strings.
3//!
4//! Binary layout of a Global Heap Collection:
5//! ```text
6//! "GCOL"              (4 bytes, signature)
7//! version             (1 byte, must be 1)
8//! reserved            (3 bytes)
9//! collection_size     (sizeof_size bytes LE, total including header)
10//!
11//! Followed by heap objects:
12//!   index             (u16 LE, 0 = free space / end marker, 1+ = object)
13//!   ref_count          (u16 LE)
14//!   reserved           (u32 LE)
15//!   size               (sizeof_size bytes LE)
16//!   data               (size bytes, padded to 8-byte alignment)
17//! ```
18//!
19//! A variable-length reference stored in dataset raw data is:
20//! ```text
21//! sequence_length     (u32 LE, length of the vlen sequence)
22//! collection_address  (sizeof_addr bytes LE, address of the GCOL)
23//! object_index        (u32 LE, index within the collection)
24//! ```
25//! Total vlen reference size = 4 + sizeof_addr + 4 bytes.
26
27use crate::format::{FormatContext, FormatError, FormatResult};
28
/// Magic bytes that open every global heap collection on disk.
const GCOL_SIGNATURE: [u8; 4] = [b'G', b'C', b'O', b'L'];

/// Version byte written after the signature; the decoder accepts no other.
const GCOL_VERSION: u8 = 1u8;

/// Smallest collection the encoder emits, in bytes (H5HG_MINALLOC in the
/// HDF5 C library).
const GCOL_MIN_SIZE: usize = 4 * 1024;
37
/// One entry stored in a global heap collection.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GlobalHeapObject {
    /// 1-based identifier of the object. Zero never names a real object
    /// because it denotes the free-space marker.
    pub index: u16,
    /// The object's payload bytes, without any alignment padding.
    pub data: Vec<u8>,
}
46
47/// A global heap collection, containing a set of heap objects.
48#[derive(Debug, Clone, PartialEq, Eq)]
49pub struct GlobalHeapCollection {
50    /// The heap objects in this collection (index > 0).
51    pub objects: Vec<GlobalHeapObject>,
52}
53
54impl GlobalHeapCollection {
55    /// Create an empty global heap collection.
56    pub fn new() -> Self {
57        Self {
58            objects: Vec::new(),
59        }
60    }
61
62    /// Add a data blob to the collection. Returns the 1-based object index.
63    pub fn add_object(&mut self, data: Vec<u8>) -> u16 {
64        let index = if self.objects.is_empty() {
65            1
66        } else {
67            self.objects.iter().map(|o| o.index).max().unwrap_or(0) + 1
68        };
69        self.objects.push(GlobalHeapObject { index, data });
70        index
71    }
72
73    /// Retrieve the data for an object by its 1-based index.
74    pub fn get_object(&self, index: u16) -> Option<&[u8]> {
75        self.objects
76            .iter()
77            .find(|o| o.index == index)
78            .map(|o| o.data.as_slice())
79    }
80
81    /// Encode the collection into a byte vector.
82    ///
83    /// The encoded blob includes the GCOL header and all heap objects,
84    /// followed by a free-space marker (index=0 object).
85    /// The total size is padded to at least 4096 bytes (H5HG_MINALLOC)
86    /// for compatibility with the HDF5 C library.
87    pub fn encode(&self, ctx: &FormatContext) -> Vec<u8> {
88        let ss = ctx.sizeof_size as usize;
89
90        // Compute body size: sum of all object encodings + free-space marker
91        // Each object: 2 (index) + 2 (ref_count) + 4 (reserved) + ss (size) + padded_data
92        let header_size = 4 + 1 + 3 + ss; // GCOL + version + reserved + collection_size
93        let mut objects_size: usize = 0;
94        for obj in &self.objects {
95            let padded = pad_to_8(obj.data.len());
96            objects_size += 2 + 2 + 4 + ss + padded;
97        }
98        // Free-space marker: index(2) + ref_count(2) + reserved(4) + size(ss) = 8 + ss
99        let free_marker_size = 2 + 2 + 4 + ss;
100        let content_size = header_size + objects_size + free_marker_size;
101
102        // HDF5 C library requires collection_size >= 4096 (H5HG_MINALLOC)
103        let collection_size = content_size.max(GCOL_MIN_SIZE);
104        // HDF5 convention: free marker size = collection_size - header - objects
105        // (includes the free marker's own header in the "free space")
106        let free_space = collection_size - header_size - objects_size;
107
108        let mut buf = Vec::with_capacity(collection_size);
109
110        // Header
111        buf.extend_from_slice(&GCOL_SIGNATURE);
112        buf.push(GCOL_VERSION);
113        buf.extend_from_slice(&[0u8; 3]); // reserved
114        buf.extend_from_slice(&(collection_size as u64).to_le_bytes()[..ss]);
115
116        // Objects
117        for obj in &self.objects {
118            buf.extend_from_slice(&obj.index.to_le_bytes());
119            buf.extend_from_slice(&1u16.to_le_bytes()); // ref_count = 1
120            buf.extend_from_slice(&0u32.to_le_bytes()); // reserved
121            buf.extend_from_slice(&(obj.data.len() as u64).to_le_bytes()[..ss]);
122            buf.extend_from_slice(&obj.data);
123            // Pad to 8-byte alignment
124            let pad = pad_to_8(obj.data.len()) - obj.data.len();
125            if pad > 0 {
126                buf.extend_from_slice(&vec![0u8; pad]);
127            }
128        }
129
130        // Free-space marker (index = 0) with remaining space
131        buf.extend_from_slice(&0u16.to_le_bytes()); // index = 0
132        buf.extend_from_slice(&0u16.to_le_bytes()); // ref_count = 0
133        buf.extend_from_slice(&0u32.to_le_bytes()); // reserved
134        buf.extend_from_slice(&(free_space as u64).to_le_bytes()[..ss]); // free space size
135
136        // Zero-fill remaining space
137        buf.resize(collection_size, 0);
138
139        debug_assert_eq!(buf.len(), collection_size);
140        buf
141    }
142
143    /// Decode a global heap collection from a byte buffer.
144    ///
145    /// Returns the collection and the number of bytes consumed.
146    pub fn decode(buf: &[u8], ctx: &FormatContext) -> FormatResult<(Self, usize)> {
147        let ss = ctx.sizeof_size as usize;
148        let header_size = 4 + 1 + 3 + ss;
149
150        if buf.len() < header_size {
151            return Err(FormatError::BufferTooShort {
152                needed: header_size,
153                available: buf.len(),
154            });
155        }
156
157        // Signature
158        if buf[0..4] != GCOL_SIGNATURE {
159            return Err(FormatError::InvalidSignature);
160        }
161
162        // Version
163        let version = buf[4];
164        if version != GCOL_VERSION {
165            return Err(FormatError::InvalidVersion(version));
166        }
167
168        // Reserved (bytes 5..8) -- skip
169
170        // Collection size
171        let collection_size = read_size(&buf[8..], ss) as usize;
172
173        if buf.len() < collection_size {
174            return Err(FormatError::BufferTooShort {
175                needed: collection_size,
176                available: buf.len(),
177            });
178        }
179
180        // Parse objects
181        let mut pos = header_size;
182        let mut objects = Vec::new();
183
184        while pos + 2 + 2 + 4 + ss <= collection_size {
185            let index = u16::from_le_bytes([buf[pos], buf[pos + 1]]);
186            pos += 2;
187            let _ref_count = u16::from_le_bytes([buf[pos], buf[pos + 1]]);
188            pos += 2;
189            let _reserved =
190                u32::from_le_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]);
191            pos += 4;
192            let size = read_size(&buf[pos..], ss) as usize;
193            pos += ss;
194
195            if index == 0 {
196                // Free-space marker -- end of used objects
197                break;
198            }
199
200            if pos + size > collection_size {
201                return Err(FormatError::InvalidData(format!(
202                    "global heap object {} extends past collection boundary",
203                    index,
204                )));
205            }
206
207            let data = buf[pos..pos + size].to_vec();
208            let padded = pad_to_8(size);
209            pos += padded;
210
211            objects.push(GlobalHeapObject { index, data });
212        }
213
214        Ok((Self { objects }, collection_size))
215    }
216}
217
218impl Default for GlobalHeapCollection {
219    fn default() -> Self {
220        Self::new()
221    }
222}
223
224/// Encode a variable-length reference (used in dataset raw data).
225///
226/// On-disk format per element:
227///   sequence_length (u32 LE) + collection_address (sizeof_addr bytes) + object_index (u32 LE).
228pub fn encode_vlen_reference(
229    sequence_length: u32,
230    collection_addr: u64,
231    object_index: u32,
232    ctx: &FormatContext,
233) -> Vec<u8> {
234    let sa = ctx.sizeof_addr as usize;
235    let mut buf = Vec::with_capacity(4 + sa + 4);
236    buf.extend_from_slice(&sequence_length.to_le_bytes());
237    buf.extend_from_slice(&collection_addr.to_le_bytes()[..sa]);
238    buf.extend_from_slice(&object_index.to_le_bytes());
239    buf
240}
241
242/// Decode a variable-length reference from dataset raw data.
243///
244/// Returns `(sequence_length, collection_address, object_index)`.
245pub fn decode_vlen_reference(buf: &[u8], ctx: &FormatContext) -> FormatResult<(u32, u64, u32)> {
246    let sa = ctx.sizeof_addr as usize;
247    let total = 4 + sa + 4;
248    if buf.len() < total {
249        return Err(FormatError::BufferTooShort {
250            needed: total,
251            available: buf.len(),
252        });
253    }
254    let seq_len = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
255    let addr = read_size(&buf[4..], sa);
256    let index = u32::from_le_bytes([
257        buf[4 + sa],
258        buf[4 + sa + 1],
259        buf[4 + sa + 2],
260        buf[4 + sa + 3],
261    ]);
262    Ok((seq_len, addr, index))
263}
264
265/// Return the size of a vlen reference in bytes: 4 + sizeof_addr + 4.
266pub fn vlen_reference_size(ctx: &FormatContext) -> usize {
267    4 + ctx.sizeof_addr as usize + 4
268}
269
/// Smallest multiple of 8 that is greater than or equal to `n`.
fn pad_to_8(n: usize) -> usize {
    let remainder = n % 8;
    if remainder == 0 {
        n
    } else {
        n + (8 - remainder)
    }
}
274
/// Interpret the first `n` bytes of `buf` as a little-endian unsigned integer
/// and widen it to `u64`.
///
/// Panics if `n` exceeds 8 or `buf` holds fewer than `n` bytes.
fn read_size(buf: &[u8], n: usize) -> u64 {
    let mut widened = [0u8; 8];
    // Low-order bytes come first; the untouched tail stays zero.
    for (dst, src) in widened[..n].iter_mut().zip(&buf[..n]) {
        *dst = *src;
    }
    u64::from_le_bytes(widened)
}
281
282// ======================================================================= tests
283
#[cfg(test)]
mod tests {
    use super::*;

    /// Context using 8-byte addresses and lengths.
    fn ctx() -> FormatContext {
        FormatContext {
            sizeof_addr: 8,
            sizeof_size: 8,
        }
    }

    /// Context using 4-byte addresses and lengths.
    fn ctx4() -> FormatContext {
        FormatContext {
            sizeof_addr: 4,
            sizeof_size: 4,
        }
    }

    #[test]
    fn empty_collection_roundtrip() {
        let original = GlobalHeapCollection::new();
        let bytes = original.encode(&ctx());
        let (parsed, used) = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap();
        assert_eq!(used, bytes.len());
        assert_eq!(parsed, original);
        assert!(parsed.objects.is_empty());
    }

    #[test]
    fn single_object_roundtrip() {
        let mut original = GlobalHeapCollection::new();
        assert_eq!(original.add_object(b"hello".to_vec()), 1);

        let bytes = original.encode(&ctx());
        let (parsed, used) = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap();
        assert_eq!(used, bytes.len());
        assert_eq!(parsed.objects.len(), 1);

        let only = &parsed.objects[0];
        assert_eq!(only.index, 1);
        assert_eq!(only.data, b"hello");
    }

    #[test]
    fn multiple_objects_roundtrip() {
        let payloads: [&[u8]; 3] = [b"alpha", b"beta", b"gamma delta"];

        let mut original = GlobalHeapCollection::new();
        for (n, payload) in payloads.iter().enumerate() {
            // Indices are handed out sequentially starting at 1.
            assert_eq!(original.add_object(payload.to_vec()), n as u16 + 1);
        }

        let bytes = original.encode(&ctx());
        let (parsed, _) = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap();
        assert_eq!(parsed.objects.len(), 3);
        for (n, payload) in payloads.iter().enumerate() {
            assert_eq!(parsed.get_object(n as u16 + 1), Some(*payload));
        }
    }

    #[test]
    fn get_object_not_found() {
        assert_eq!(GlobalHeapCollection::new().get_object(1), None);
    }

    #[test]
    fn padding_to_8() {
        let cases: [(usize, usize); 6] = [(0, 0), (1, 8), (7, 8), (8, 8), (9, 16), (16, 16)];
        for &(input, expected) in cases.iter() {
            assert_eq!(pad_to_8(input), expected);
        }
    }

    #[test]
    fn vlen_reference_roundtrip() {
        let c = ctx();
        let bytes = encode_vlen_reference(5, 0x1234_5678_9ABC_DEF0, 42, &c);
        assert_eq!(bytes.len(), vlen_reference_size(&c));

        let decoded = decode_vlen_reference(&bytes, &c).unwrap();
        assert_eq!(decoded.0, 5);
        assert_eq!(decoded.1, 0x1234_5678_9ABC_DEF0);
        assert_eq!(decoded.2, 42);
    }

    #[test]
    fn vlen_reference_4byte_roundtrip() {
        let c = ctx4();
        let bytes = encode_vlen_reference(10, 0x1234_5678, 7, &c);
        assert_eq!(bytes.len(), 12); // 4 + 4 + 4

        let decoded = decode_vlen_reference(&bytes, &c).unwrap();
        assert_eq!(decoded.0, 10);
        assert_eq!(decoded.1, 0x1234_5678);
        assert_eq!(decoded.2, 7);
    }

    #[test]
    fn vlen_reference_size_check() {
        assert_eq!(vlen_reference_size(&ctx()), 16);
        assert_eq!(vlen_reference_size(&ctx4()), 12);
    }

    #[test]
    fn decode_bad_signature() {
        // 32 bytes: a wrong signature followed by zeros.
        let mut bytes = b"XYZW".to_vec();
        bytes.resize(32, 0);
        let err = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::InvalidSignature));
    }

    #[test]
    fn decode_bad_version() {
        let mut bytes = GlobalHeapCollection::new().encode(&ctx());
        bytes[4] = 99;
        let err = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::InvalidVersion(99)));
    }

    #[test]
    fn decode_buffer_too_short() {
        let bytes = [0u8; 4];
        let err = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap_err();
        assert!(matches!(err, FormatError::BufferTooShort { .. }));
    }

    #[test]
    fn ctx4_roundtrip() {
        let c = ctx4();
        let mut original = GlobalHeapCollection::new();
        original.add_object(b"test data".to_vec());

        let bytes = original.encode(&c);
        let (parsed, used) = GlobalHeapCollection::decode(&bytes, &c).unwrap();
        assert_eq!(used, bytes.len());
        assert_eq!(parsed.get_object(1), Some(b"test data".as_slice()));
    }

    #[test]
    fn object_data_alignment() {
        // Payload lengths of 1, 9 and 8 bytes need 7, 7 and 0 bytes of padding;
        // all of them must survive a roundtrip unchanged.
        let payloads: [Vec<u8>; 3] = [
            vec![1],
            (2u8..=10).collect(),
            (11u8..=18).collect(),
        ];

        let mut original = GlobalHeapCollection::new();
        for payload in payloads.iter() {
            original.add_object(payload.clone());
        }

        let bytes = original.encode(&ctx());
        let (parsed, _) = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap();
        for (n, payload) in payloads.iter().enumerate() {
            assert_eq!(parsed.get_object(n as u16 + 1), Some(payload.as_slice()));
        }
    }

    #[test]
    fn empty_data_object() {
        let mut original = GlobalHeapCollection::new();
        original.add_object(Vec::new());

        let bytes = original.encode(&ctx());
        let (parsed, _) = GlobalHeapCollection::decode(&bytes, &ctx()).unwrap();
        let empty: &[u8] = &[];
        assert_eq!(parsed.get_object(1), Some(empty));
    }
}