Skip to main content

rust_hdf5/format/
btree_v1.rs

1//! B-tree v1 decode (for reading legacy HDF5 files).
2//!
3//! The B-tree v1 is used in v0/v1 groups to index symbol table entries.
4//! For group B-trees (type 0), each key is a name offset into the local
5//! heap, and each child pointer is an address of either a SNOD (leaf
6//! level) or another TREE node (internal level).
7//!
8//! Layout:
9//! ```text
10//! "TREE" (4 bytes)
11//! type: 1 byte (0 = group)
12//! level: 1 byte (0 = leaf)
13//! entries_used: u16 LE
14//! left_sibling: sizeof_addr bytes LE
15//! right_sibling: sizeof_addr bytes LE
16//! Then interleaved keys and children:
17//!   key[0], child[0], key[1], child[1], ..., key[entries_used]
18//! ```
19//!
20//! For type-0 (group) B-trees:
21//! - Each key is sizeof_size bytes (name offset into local heap)
22//! - Each child is sizeof_addr bytes (address of SNOD or sub-TREE)
23//!
24//! For type-1 (raw data chunk) B-trees:
25//! - Each key is `4 + 4 + (rank+1)*8` bytes: chunk_size(4), filter_mask(4),
26//!   then (rank+1) 8-byte element offsets. The last offset is the
27//!   element-size dimension and is always 0.
28//! - Each child is sizeof_addr bytes: a chunk-data address at a leaf
29//!   node (level 0), or a sub-TREE address at an internal node.
30
31use crate::format::bytes::{read_le_addr as read_addr, read_le_uint as read_uint};
32use crate::format::{FormatError, FormatResult};
33
34/// The 4-byte B-tree v1 signature.
35pub const BTREE_V1_SIGNATURE: [u8; 4] = *b"TREE";
36
37/// A decoded B-tree v1 node.
38#[derive(Debug, Clone)]
39pub struct BTreeV1Node {
40    /// Node type: 0 = group, 1 = raw data chunk.
41    pub node_type: u8,
42    /// Node level: 0 = leaf (children are SNODs), >0 = internal (children are sub-TREE).
43    pub level: u8,
44    /// Number of entries used in this node.
45    pub entries_used: u16,
46    /// Address of left sibling, or UNDEF_ADDR if none.
47    pub left_sibling: u64,
48    /// Address of right sibling, or UNDEF_ADDR if none.
49    pub right_sibling: u64,
50    /// Keys (entries_used + 1 entries for type-0 group trees).
51    pub keys: Vec<u64>,
52    /// Child addresses (entries_used entries).
53    pub children: Vec<u64>,
54}
55
56impl BTreeV1Node {
57    /// Decode a B-tree v1 node from `buf`.
58    ///
59    /// `sizeof_addr` and `sizeof_size` come from the superblock.
60    pub fn decode(buf: &[u8], sizeof_addr: usize, sizeof_size: usize) -> FormatResult<Self> {
61        let header_size = 4 + 1 + 1 + 2 + sizeof_addr * 2;
62        if buf.len() < header_size {
63            return Err(FormatError::BufferTooShort {
64                needed: header_size,
65                available: buf.len(),
66            });
67        }
68
69        if buf[0..4] != BTREE_V1_SIGNATURE {
70            return Err(FormatError::InvalidSignature);
71        }
72
73        let node_type = buf[4];
74        let level = buf[5];
75        let entries_used = u16::from_le_bytes([buf[6], buf[7]]);
76
77        let mut pos = 8;
78        let left_sibling = read_addr(&buf[pos..], sizeof_addr);
79        pos += sizeof_addr;
80        let right_sibling = read_addr(&buf[pos..], sizeof_addr);
81        pos += sizeof_addr;
82
83        // For group B-trees (type 0):
84        // Interleaved: key[0], child[0], key[1], child[1], ..., key[n]
85        // That's (entries_used + 1) keys and entries_used children.
86        let n = entries_used as usize;
87
88        if node_type == 0 {
89            // Group B-tree
90            let key_size = sizeof_size;
91            let child_size = sizeof_addr;
92            // Total data: (n+1) keys interleaved with n children
93            let data_size = (n + 1) * key_size + n * child_size;
94            let needed = pos + data_size;
95            if buf.len() < needed {
96                return Err(FormatError::BufferTooShort {
97                    needed,
98                    available: buf.len(),
99                });
100            }
101
102            let mut keys = Vec::with_capacity(n + 1);
103            let mut children = Vec::with_capacity(n);
104
105            for _i in 0..n {
106                // key[i]
107                keys.push(read_uint(&buf[pos..], key_size));
108                pos += key_size;
109                // child[i]
110                children.push(read_uint(&buf[pos..], child_size));
111                pos += child_size;
112            }
113            // final key[n]
114            keys.push(read_uint(&buf[pos..], key_size));
115
116            Ok(BTreeV1Node {
117                node_type,
118                level,
119                entries_used,
120                left_sibling,
121                right_sibling,
122                keys,
123                children,
124            })
125        } else {
126            // Raw data chunk B-tree (type 1) is decoded via
127            // `ChunkBTreeV1Node::decode`, which understands the chunk-key
128            // structure. `BTreeV1Node` only models type-0 group trees.
129            Err(FormatError::UnsupportedFeature(format!(
130                "B-tree v1 type {} not supported by BTreeV1Node (use ChunkBTreeV1Node)",
131                node_type
132            )))
133        }
134    }
135}
136
137/// A decoded chunk key from a raw-data-chunk (type-1) B-tree v1 node.
138#[derive(Debug, Clone, PartialEq, Eq)]
139pub struct ChunkKey {
140    /// Size in bytes of the stored chunk (compressed size when filtered).
141    pub chunk_size: u32,
142    /// Filter mask: bit `i` set means filter `i` was skipped for this chunk.
143    pub filter_mask: u32,
144    /// Per-dimension element offsets of the chunk's first element. The
145    /// trailing entry is the element-size dimension and is always 0, so
146    /// this has `rank + 1` entries.
147    pub offsets: Vec<u64>,
148}
149
150/// A decoded raw-data-chunk (type-1) B-tree v1 node.
151#[derive(Debug, Clone)]
152pub struct ChunkBTreeV1Node {
153    /// Node level: 0 = leaf (children point at chunk data), >0 = internal
154    /// (children point at sub-TREE nodes).
155    pub level: u8,
156    /// Number of entries (children) used in this node.
157    pub entries_used: u16,
158    /// Keys, `entries_used + 1` of them. `keys[i]` describes `children[i]`;
159    /// the final key is the right-boundary key.
160    pub keys: Vec<ChunkKey>,
161    /// Child addresses, `entries_used` of them.
162    pub children: Vec<u64>,
163}
164
165impl ChunkBTreeV1Node {
166    /// Decode a type-1 (raw data chunk) B-tree v1 node from `buf`.
167    ///
168    /// `rank` is the chunk rank *excluding* the trailing element-size
169    /// dimension, so each key carries `rank + 1` 8-byte offsets — matching
170    /// libhdf5's `H5O_layout_chunk_t::ndims` (which includes the element
171    /// dimension).
172    pub fn decode(buf: &[u8], sizeof_addr: usize, rank: usize) -> FormatResult<Self> {
173        let header_size = 4 + 1 + 1 + 2 + sizeof_addr * 2;
174        if buf.len() < header_size {
175            return Err(FormatError::BufferTooShort {
176                needed: header_size,
177                available: buf.len(),
178            });
179        }
180
181        if buf[0..4] != BTREE_V1_SIGNATURE {
182            return Err(FormatError::InvalidSignature);
183        }
184
185        let node_type = buf[4];
186        if node_type != 1 {
187            return Err(FormatError::UnsupportedFeature(format!(
188                "expected B-tree v1 chunk node (type 1), found type {node_type}"
189            )));
190        }
191        let level = buf[5];
192        let entries_used = u16::from_le_bytes([buf[6], buf[7]]);
193
194        // Skip the signature/type/level/entries header plus the two
195        // sibling pointers.
196        let mut pos = 8 + sizeof_addr * 2;
197
198        let n = entries_used as usize;
199        // Each chunk key: chunk_size(4) + filter_mask(4) + (rank+1)*8.
200        let key_size = 4 + 4 + (rank + 1) * 8;
201        // Interleaved: key[0] child[0] ... key[n-1] child[n-1] key[n].
202        let data_size = (n + 1) * key_size + n * sizeof_addr;
203        let needed = pos + data_size;
204        if buf.len() < needed {
205            return Err(FormatError::BufferTooShort {
206                needed,
207                available: buf.len(),
208            });
209        }
210
211        let decode_key = |slice: &[u8]| -> ChunkKey {
212            let chunk_size = u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]);
213            let filter_mask = u32::from_le_bytes([slice[4], slice[5], slice[6], slice[7]]);
214            let mut offsets = Vec::with_capacity(rank + 1);
215            let mut o = 8;
216            for _ in 0..(rank + 1) {
217                offsets.push(read_uint(&slice[o..], 8));
218                o += 8;
219            }
220            ChunkKey {
221                chunk_size,
222                filter_mask,
223                offsets,
224            }
225        };
226
227        let mut keys = Vec::with_capacity(n + 1);
228        let mut children = Vec::with_capacity(n);
229        for _ in 0..n {
230            keys.push(decode_key(&buf[pos..pos + key_size]));
231            pos += key_size;
232            children.push(read_addr(&buf[pos..], sizeof_addr));
233            pos += sizeof_addr;
234        }
235        // Final right-boundary key.
236        keys.push(decode_key(&buf[pos..pos + key_size]));
237
238        Ok(ChunkBTreeV1Node {
239            level,
240            entries_used,
241            keys,
242            children,
243        })
244    }
245}
246
247// ======================================================================= tests
248
249#[cfg(test)]
250mod tests {
251    use super::*;
252    use crate::format::UNDEF_ADDR;
253
254    /// Build a group B-tree v1 node for testing.
255    fn build_group_btree(
256        level: u8,
257        keys: &[u64],
258        children: &[u64],
259        sizeof_addr: usize,
260        sizeof_size: usize,
261    ) -> Vec<u8> {
262        assert_eq!(keys.len(), children.len() + 1);
263        let entries_used = children.len() as u16;
264
265        let mut buf = Vec::new();
266        buf.extend_from_slice(&BTREE_V1_SIGNATURE);
267        buf.push(0); // type = group
268        buf.push(level);
269        buf.extend_from_slice(&entries_used.to_le_bytes());
270        // left sibling = UNDEF
271        buf.extend_from_slice(&UNDEF_ADDR.to_le_bytes()[..sizeof_addr]);
272        // right sibling = UNDEF
273        buf.extend_from_slice(&UNDEF_ADDR.to_le_bytes()[..sizeof_addr]);
274
275        // Interleaved keys and children
276        for i in 0..children.len() {
277            buf.extend_from_slice(&keys[i].to_le_bytes()[..sizeof_size]);
278            buf.extend_from_slice(&children[i].to_le_bytes()[..sizeof_addr]);
279        }
280        // Final key
281        buf.extend_from_slice(&keys[children.len()].to_le_bytes()[..sizeof_size]);
282
283        buf
284    }
285
286    #[test]
287    fn decode_leaf_node() {
288        let buf = build_group_btree(
289            0,               // leaf
290            &[0, 8, 16],     // 3 keys
291            &[0x100, 0x200], // 2 children (SNOD addresses)
292            8,
293            8,
294        );
295        let node = BTreeV1Node::decode(&buf, 8, 8).unwrap();
296        assert_eq!(node.node_type, 0);
297        assert_eq!(node.level, 0);
298        assert_eq!(node.entries_used, 2);
299        assert_eq!(node.keys, vec![0, 8, 16]);
300        assert_eq!(node.children, vec![0x100, 0x200]);
301        assert_eq!(node.left_sibling, UNDEF_ADDR);
302        assert_eq!(node.right_sibling, UNDEF_ADDR);
303    }
304
305    #[test]
306    fn decode_internal_node() {
307        let buf = build_group_btree(
308            1,         // internal
309            &[0, 100], // 2 keys
310            &[0x500],  // 1 child (sub-TREE address)
311            8,
312            8,
313        );
314        let node = BTreeV1Node::decode(&buf, 8, 8).unwrap();
315        assert_eq!(node.level, 1);
316        assert_eq!(node.entries_used, 1);
317        assert_eq!(node.children, vec![0x500]);
318    }
319
320    #[test]
321    fn decode_single_entry() {
322        let buf = build_group_btree(0, &[0, 8], &[0x100], 8, 8);
323        let node = BTreeV1Node::decode(&buf, 8, 8).unwrap();
324        assert_eq!(node.entries_used, 1);
325        assert_eq!(node.children.len(), 1);
326    }
327
328    #[test]
329    fn decode_4byte() {
330        let buf = build_group_btree(0, &[0, 4], &[0x80], 4, 4);
331        let node = BTreeV1Node::decode(&buf, 4, 4).unwrap();
332        assert_eq!(node.entries_used, 1);
333        assert_eq!(node.children, vec![0x80]);
334    }
335
336    #[test]
337    fn decode_bad_sig() {
338        let mut buf = build_group_btree(0, &[0, 8], &[0x100], 8, 8);
339        buf[0] = b'X';
340        assert!(matches!(
341            BTreeV1Node::decode(&buf, 8, 8).unwrap_err(),
342            FormatError::InvalidSignature
343        ));
344    }
345
346    #[test]
347    fn decode_too_short() {
348        assert!(matches!(
349            BTreeV1Node::decode(&[0u8; 4], 8, 8).unwrap_err(),
350            FormatError::BufferTooShort { .. }
351        ));
352    }
353
354    #[test]
355    fn decode_unsupported_type() {
356        let mut buf = build_group_btree(0, &[0, 8], &[0x100], 8, 8);
357        buf[4] = 1; // type = raw data chunks
358        assert!(matches!(
359            BTreeV1Node::decode(&buf, 8, 8).unwrap_err(),
360            FormatError::UnsupportedFeature(_)
361        ));
362    }
363
364    /// Build a type-1 (raw data chunk) B-tree v1 node for testing.
365    /// `rank` excludes the trailing element-size dimension.
366    fn build_chunk_btree(
367        level: u8,
368        keys: &[ChunkKey],
369        children: &[u64],
370        sizeof_addr: usize,
371    ) -> Vec<u8> {
372        assert_eq!(keys.len(), children.len() + 1);
373        let entries_used = children.len() as u16;
374
375        let mut buf = Vec::new();
376        buf.extend_from_slice(&BTREE_V1_SIGNATURE);
377        buf.push(1); // type = raw data chunk
378        buf.push(level);
379        buf.extend_from_slice(&entries_used.to_le_bytes());
380        buf.extend_from_slice(&UNDEF_ADDR.to_le_bytes()[..sizeof_addr]); // left
381        buf.extend_from_slice(&UNDEF_ADDR.to_le_bytes()[..sizeof_addr]); // right
382
383        let encode_key = |buf: &mut Vec<u8>, k: &ChunkKey| {
384            buf.extend_from_slice(&k.chunk_size.to_le_bytes());
385            buf.extend_from_slice(&k.filter_mask.to_le_bytes());
386            for &o in &k.offsets {
387                buf.extend_from_slice(&o.to_le_bytes());
388            }
389        };
390
391        for i in 0..children.len() {
392            encode_key(&mut buf, &keys[i]);
393            buf.extend_from_slice(&children[i].to_le_bytes()[..sizeof_addr]);
394        }
395        encode_key(&mut buf, &keys[children.len()]);
396        buf
397    }
398
399    fn chunk_key(size: u32, mask: u32, offsets: &[u64]) -> ChunkKey {
400        ChunkKey {
401            chunk_size: size,
402            filter_mask: mask,
403            offsets: offsets.to_vec(),
404        }
405    }
406
407    #[test]
408    fn decode_chunk_leaf_1d() {
409        // 1-D dataset (rank 1): each key has rank+1 = 2 offsets.
410        let keys = [
411            chunk_key(32, 0, &[0, 0]),
412            chunk_key(32, 0, &[8, 0]),
413            chunk_key(0, 0, &[16, 0]),
414        ];
415        let buf = build_chunk_btree(0, &keys, &[0x400, 0x800], 8);
416        let node = ChunkBTreeV1Node::decode(&buf, 8, 1).unwrap();
417        assert_eq!(node.level, 0);
418        assert_eq!(node.entries_used, 2);
419        assert_eq!(node.children, vec![0x400, 0x800]);
420        assert_eq!(node.keys.len(), 3);
421        assert_eq!(node.keys[0].chunk_size, 32);
422        assert_eq!(node.keys[1].offsets, vec![8, 0]);
423    }
424
425    #[test]
426    fn decode_chunk_internal_2d() {
427        // 2-D dataset (rank 2): each key has rank+1 = 3 offsets.
428        let keys = [chunk_key(64, 0, &[0, 0, 0]), chunk_key(64, 0, &[4, 4, 0])];
429        let buf = build_chunk_btree(1, &keys, &[0x1000], 8);
430        let node = ChunkBTreeV1Node::decode(&buf, 8, 2).unwrap();
431        assert_eq!(node.level, 1);
432        assert_eq!(node.entries_used, 1);
433        assert_eq!(node.children, vec![0x1000]);
434        assert_eq!(node.keys[0].offsets, vec![0, 0, 0]);
435    }
436
437    #[test]
438    fn decode_chunk_filtered_key() {
439        let keys = [chunk_key(17, 0x1, &[0, 0]), chunk_key(0, 0, &[8, 0])];
440        let buf = build_chunk_btree(0, &keys, &[0x200], 8);
441        let node = ChunkBTreeV1Node::decode(&buf, 8, 1).unwrap();
442        assert_eq!(node.keys[0].chunk_size, 17);
443        assert_eq!(node.keys[0].filter_mask, 0x1);
444    }
445
446    #[test]
447    fn decode_chunk_rejects_group_node() {
448        let buf = build_group_btree(0, &[0, 8], &[0x100], 8, 8);
449        assert!(matches!(
450            ChunkBTreeV1Node::decode(&buf, 8, 1).unwrap_err(),
451            FormatError::UnsupportedFeature(_)
452        ));
453    }
454
455    #[test]
456    fn decode_chunk_too_short() {
457        assert!(matches!(
458            ChunkBTreeV1Node::decode(&[0u8; 4], 8, 1).unwrap_err(),
459            FormatError::BufferTooShort { .. }
460        ));
461    }
462
463    #[test]
464    fn decode_chunk_bad_sig() {
465        let keys = [chunk_key(8, 0, &[0, 0]), chunk_key(0, 0, &[8, 0])];
466        let mut buf = build_chunk_btree(0, &keys, &[0x100], 8);
467        buf[0] = b'X';
468        assert!(matches!(
469            ChunkBTreeV1Node::decode(&buf, 8, 1).unwrap_err(),
470            FormatError::InvalidSignature
471        ));
472    }
473
474    #[test]
475    fn decode_chunk_4byte_addr() {
476        let keys = [chunk_key(16, 0, &[0, 0]), chunk_key(0, 0, &[4, 0])];
477        let buf = build_chunk_btree(0, &keys, &[0x80], 4);
478        let node = ChunkBTreeV1Node::decode(&buf, 4, 1).unwrap();
479        assert_eq!(node.children, vec![0x80]);
480    }
481}