Skip to main content

hdf5_reader/
btree_v1.rs

1//! HDF5 B-link Tree Version 1.
2//!
3//! V1 B-trees are used by old-style (v0/v1) groups for link name lookup
4//! (type 0) and by chunked datasets for raw data chunk indexing (type 1).
5//!
6//! The tree is a B-link tree: each node carries left and right sibling
7//! pointers. Keys bracket children — a node with N children has N+1 keys.
8//! Leaf nodes (level 0) point to either symbol table nodes (SNODs) for
9//! type 0 or raw data chunk addresses for type 1. Internal nodes point to
10//! child B-tree nodes.
11
12use crate::error::{Error, Result};
13use crate::io::Cursor;
14
15/// Signature bytes for a v1 B-tree node: ASCII `TREE`.
16const BTREE_V1_SIGNATURE: [u8; 4] = *b"TREE";
17
18/// A key within a v1 B-tree node.
19#[derive(Debug, Clone)]
20pub enum BTreeV1Key {
21    /// Type 0 (group) key: offset into the local heap for the link name.
22    Group { local_heap_offset: u64 },
23    /// Type 1 (raw data chunk) key: chunk size, filter mask, and per-dimension
24    /// offsets (including one extra for the dataset element offset).
25    RawData {
26        chunk_size: u32,
27        filter_mask: u32,
28        offsets: Vec<u64>,
29    },
30}
31
32/// A parsed v1 B-tree node.
33#[derive(Debug, Clone)]
34pub struct BTreeV1Node {
35    /// Node type: 0 = group, 1 = raw data chunk.
36    pub node_type: u8,
37    /// Node level: 0 = leaf, > 0 = internal.
38    pub level: u8,
39    /// Number of entries (children) actually in use.
40    pub entries_used: u16,
41    /// Address of the left sibling node (undefined if none).
42    pub left_sibling: u64,
43    /// Address of the right sibling node (undefined if none).
44    pub right_sibling: u64,
45    /// Keys bracketing the children. Length is `entries_used + 1`.
46    pub keys: Vec<BTreeV1Key>,
47    /// Child addresses. Length is `entries_used`.
48    pub children: Vec<u64>,
49}
50
51impl BTreeV1Node {
52    /// Parse a v1 B-tree node at the current cursor position.
53    ///
54    /// `ndims` is required for type-1 (raw data chunk) nodes — it is the
55    /// number of dimensions of the dataset's dataspace. For type-0 (group)
56    /// nodes pass `None`.
57    ///
58    /// Format:
59    /// - Signature: `TREE` (4 bytes)
60    /// - Node type (u8): 0 = group, 1 = raw data
61    /// - Node level (u8)
62    /// - Entries used (u16 LE)
63    /// - Left sibling address (`offset_size` bytes)
64    /// - Right sibling address (`offset_size` bytes)
65    /// - Then interleaved keys and child pointers:
66    ///   key[0], child[0], key[1], child[1], ..., key[K-1], child[K-1], key[K]
67    ///   where K = entries_used.
68    pub fn parse(
69        cursor: &mut Cursor,
70        offset_size: u8,
71        length_size: u8,
72        ndims: Option<u32>,
73    ) -> Result<Self> {
74        let sig = cursor.read_bytes(4)?;
75        if sig != BTREE_V1_SIGNATURE {
76            return Err(Error::InvalidBTreeSignature);
77        }
78
79        let node_type = cursor.read_u8()?;
80        let level = cursor.read_u8()?;
81        let entries_used = cursor.read_u16_le()?;
82        let left_sibling = cursor.read_offset(offset_size)?;
83        let right_sibling = cursor.read_offset(offset_size)?;
84
85        let num_keys = entries_used as usize + 1;
86        let num_children = entries_used as usize;
87
88        let mut keys = Vec::with_capacity(num_keys);
89        let mut children = Vec::with_capacity(num_children);
90
91        // Read interleaved keys and children:
92        // key[0], child[0], key[1], child[1], ..., key[K-1], child[K-1], key[K]
93        for i in 0..num_keys {
94            let key = match node_type {
95                0 => parse_group_key(cursor, length_size)?,
96                1 => parse_raw_data_key(cursor, offset_size, ndims)?,
97                _ => {
98                    return Err(Error::InvalidData(format!(
99                        "unknown v1 B-tree node type: {node_type}"
100                    )));
101                }
102            };
103            keys.push(key);
104
105            // Read child address after every key except the last.
106            if i < num_children {
107                let child_addr = cursor.read_offset(offset_size)?;
108                children.push(child_addr);
109            }
110        }
111
112        Ok(BTreeV1Node {
113            node_type,
114            level,
115            entries_used,
116            left_sibling,
117            right_sibling,
118            keys,
119            children,
120        })
121    }
122}
123
124/// Parse a type-0 (group) key: just a length-sized offset into the local heap.
125fn parse_group_key(cursor: &mut Cursor, length_size: u8) -> Result<BTreeV1Key> {
126    let local_heap_offset = cursor.read_length(length_size)?;
127    Ok(BTreeV1Key::Group { local_heap_offset })
128}
129
130/// Parse a type-1 (raw data chunk) key.
131///
132/// Format:
133/// - chunk_size (u32 LE) — size of the chunk in bytes after filters
134/// - filter_mask (u32 LE) — bit mask of filters to skip
135/// - (ndims + 1) offsets, each `offset_size` bytes — chunk offsets per dimension
136///   plus an extra trailing offset (dataset element offset, typically 0)
137fn parse_raw_data_key(
138    cursor: &mut Cursor,
139    offset_size: u8,
140    ndims: Option<u32>,
141) -> Result<BTreeV1Key> {
142    let nd = ndims.ok_or_else(|| {
143        Error::InvalidData("ndims required for raw data chunk B-tree keys".into())
144    })?;
145
146    let chunk_size = cursor.read_u32_le()?;
147    let filter_mask = cursor.read_u32_le()?;
148
149    let num_offsets = nd as usize + 1;
150    let mut offsets = Vec::with_capacity(num_offsets);
151    for _ in 0..num_offsets {
152        offsets.push(cursor.read_offset(offset_size)?);
153    }
154
155    Ok(BTreeV1Key::RawData {
156        chunk_size,
157        filter_mask,
158        offsets,
159    })
160}
161
162/// Walk a v1 B-tree and collect all (key, child_address) pairs from leaf nodes.
163///
164/// For leaf nodes (level 0), this returns one entry per child paired with the
165/// key that precedes it (key[i] for child[i]). For internal nodes, this
166/// recurses into each child node.
167///
168/// `data` must be the full file data (or at least the region containing the
169/// B-tree nodes). `root_address` is the file offset of the root node.
170pub fn collect_btree_v1_leaves(
171    data: &[u8],
172    root_address: u64,
173    offset_size: u8,
174    length_size: u8,
175    ndims: Option<u32>,
176    chunk_dims: &[u32],
177    chunk_bounds: Option<(&[u64], &[u64])>,
178) -> Result<Vec<(BTreeV1Key, u64)>> {
179    let mut results = Vec::new();
180    collect_recursive(
181        BTreeV1CollectContext {
182            data,
183            offset_size,
184            length_size,
185            ndims,
186            chunk_dims,
187            chunk_bounds,
188        },
189        root_address,
190        &mut results,
191    )?;
192    Ok(results)
193}
194
195#[derive(Clone, Copy)]
196struct BTreeV1CollectContext<'a> {
197    data: &'a [u8],
198    offset_size: u8,
199    length_size: u8,
200    ndims: Option<u32>,
201    chunk_dims: &'a [u32],
202    chunk_bounds: Option<(&'a [u64], &'a [u64])>,
203}
204
205/// Recursive helper for tree traversal.
206fn collect_recursive(
207    context: BTreeV1CollectContext<'_>,
208    address: u64,
209    results: &mut Vec<(BTreeV1Key, u64)>,
210) -> Result<()> {
211    if Cursor::is_undefined_offset(address, context.offset_size) {
212        return Ok(());
213    }
214
215    if address as usize >= context.data.len() {
216        return Err(Error::OffsetOutOfBounds(address));
217    }
218
219    let mut cursor = Cursor::new(context.data);
220    cursor.set_position(address);
221
222    let node = BTreeV1Node::parse(
223        &mut cursor,
224        context.offset_size,
225        context.length_size,
226        context.ndims,
227    )?;
228
229    if node.level == 0 {
230        // Leaf node — collect (key[i], child[i]) pairs.
231        for (i, child_addr) in node.children.iter().enumerate() {
232            let include = match &node.keys[i] {
233                BTreeV1Key::RawData { offsets, .. } => {
234                    context.chunk_bounds.map_or(true, |(first, last)| {
235                        offsets
236                            .iter()
237                            .zip(context.chunk_dims.iter())
238                            .enumerate()
239                            .all(|(dim, (offset, chunk_dim))| {
240                                let chunk_index = *offset / u64::from(*chunk_dim);
241                                chunk_index >= first[dim] && chunk_index <= last[dim]
242                            })
243                    })
244                }
245                _ => true,
246            };
247
248            if include {
249                results.push((node.keys[i].clone(), *child_addr));
250            }
251        }
252    } else {
253        // Internal node — recurse into each child.
254        for child_addr in &node.children {
255            collect_recursive(context, *child_addr, results)?;
256        }
257    }
258
259    Ok(())
260}
261
262#[cfg(test)]
263mod tests {
264    use super::*;
265
266    /// Build a v1 B-tree node (type 0, group) from the given parameters.
267    #[allow(clippy::too_many_arguments)]
268    fn build_group_node(
269        level: u8,
270        entries_used: u16,
271        left_sibling: u64,
272        right_sibling: u64,
273        keys: &[u64],     // local heap offsets
274        children: &[u64], // child addresses
275        offset_size: u8,
276        length_size: u8,
277    ) -> Vec<u8> {
278        assert_eq!(keys.len(), entries_used as usize + 1);
279        assert_eq!(children.len(), entries_used as usize);
280
281        let mut buf = Vec::new();
282        buf.extend_from_slice(b"TREE");
283        buf.push(0); // type 0 = group
284        buf.push(level);
285        buf.extend_from_slice(&entries_used.to_le_bytes());
286        push_offset(&mut buf, left_sibling, offset_size);
287        push_offset(&mut buf, right_sibling, offset_size);
288
289        for i in 0..keys.len() {
290            push_length(&mut buf, keys[i], length_size);
291            if i < children.len() {
292                push_offset(&mut buf, children[i], offset_size);
293            }
294        }
295
296        buf
297    }
298
299    /// Build a v1 B-tree node (type 1, raw data) from the given parameters.
300    fn build_rawdata_node(
301        level: u8,
302        entries_used: u16,
303        left_sibling: u64,
304        right_sibling: u64,
305        keys: &[(u32, u32, Vec<u64>)], // (chunk_size, filter_mask, offsets)
306        children: &[u64],
307        offset_size: u8,
308    ) -> Vec<u8> {
309        assert_eq!(keys.len(), entries_used as usize + 1);
310        assert_eq!(children.len(), entries_used as usize);
311
312        let mut buf = Vec::new();
313        buf.extend_from_slice(b"TREE");
314        buf.push(1); // type 1 = raw data
315        buf.push(level);
316        buf.extend_from_slice(&entries_used.to_le_bytes());
317        push_offset(&mut buf, left_sibling, offset_size);
318        push_offset(&mut buf, right_sibling, offset_size);
319
320        for i in 0..keys.len() {
321            let (cs, fm, ref offs) = keys[i];
322            buf.extend_from_slice(&cs.to_le_bytes());
323            buf.extend_from_slice(&fm.to_le_bytes());
324            for &o in offs {
325                push_offset(&mut buf, o, offset_size);
326            }
327            if i < children.len() {
328                push_offset(&mut buf, children[i], offset_size);
329            }
330        }
331
332        buf
333    }
334
335    fn push_offset(buf: &mut Vec<u8>, val: u64, size: u8) {
336        match size {
337            4 => buf.extend_from_slice(&(val as u32).to_le_bytes()),
338            8 => buf.extend_from_slice(&val.to_le_bytes()),
339            _ => panic!("unsupported offset size in test"),
340        }
341    }
342
343    fn push_length(buf: &mut Vec<u8>, val: u64, size: u8) {
344        match size {
345            4 => buf.extend_from_slice(&(val as u32).to_le_bytes()),
346            8 => buf.extend_from_slice(&val.to_le_bytes()),
347            _ => panic!("unsupported length size in test"),
348        }
349    }
350
351    #[test]
352    fn test_parse_group_leaf_node() {
353        let undef8 = 0xFFFF_FFFF_FFFF_FFFFu64;
354        let data = build_group_node(
355            0,                 // leaf
356            2,                 // 2 entries
357            undef8,            // no left sibling
358            undef8,            // no right sibling
359            &[0, 8, 16],       // 3 keys (offsets into local heap)
360            &[0x1000, 0x2000], // 2 children (SNOD addresses)
361            8,
362            8,
363        );
364
365        let mut cursor = Cursor::new(&data);
366        let node = BTreeV1Node::parse(&mut cursor, 8, 8, None).unwrap();
367
368        assert_eq!(node.node_type, 0);
369        assert_eq!(node.level, 0);
370        assert_eq!(node.entries_used, 2);
371        assert!(Cursor::is_undefined_offset(node.left_sibling, 8));
372        assert!(Cursor::is_undefined_offset(node.right_sibling, 8));
373        assert_eq!(node.keys.len(), 3);
374        assert_eq!(node.children.len(), 2);
375
376        match &node.keys[0] {
377            BTreeV1Key::Group { local_heap_offset } => assert_eq!(*local_heap_offset, 0),
378            _ => panic!("expected Group key"),
379        }
380        match &node.keys[1] {
381            BTreeV1Key::Group { local_heap_offset } => assert_eq!(*local_heap_offset, 8),
382            _ => panic!("expected Group key"),
383        }
384        assert_eq!(node.children[0], 0x1000);
385        assert_eq!(node.children[1], 0x2000);
386    }
387
388    #[test]
389    fn test_parse_rawdata_leaf_node() {
390        let undef8 = 0xFFFF_FFFF_FFFF_FFFFu64;
391        // 2D dataset, ndims=2, so keys have 3 offsets each (ndims+1)
392        let data = build_rawdata_node(
393            0, // leaf
394            1, // 1 entry
395            undef8,
396            undef8,
397            &[
398                (1024, 0, vec![0, 0, 0]),   // key[0]: chunk at origin
399                (1024, 0, vec![10, 20, 0]), // key[1]: next chunk boundary
400            ],
401            &[0x5000], // 1 child = chunk data address
402            8,
403        );
404
405        let mut cursor = Cursor::new(&data);
406        let node = BTreeV1Node::parse(&mut cursor, 8, 8, Some(2)).unwrap();
407
408        assert_eq!(node.node_type, 1);
409        assert_eq!(node.level, 0);
410        assert_eq!(node.entries_used, 1);
411        assert_eq!(node.keys.len(), 2);
412        assert_eq!(node.children.len(), 1);
413
414        match &node.keys[0] {
415            BTreeV1Key::RawData {
416                chunk_size,
417                filter_mask,
418                offsets,
419            } => {
420                assert_eq!(*chunk_size, 1024);
421                assert_eq!(*filter_mask, 0);
422                assert_eq!(offsets, &[0, 0, 0]);
423            }
424            _ => panic!("expected RawData key"),
425        }
426
427        assert_eq!(node.children[0], 0x5000);
428    }
429
430    #[test]
431    fn test_parse_rawdata_leaf_4byte() {
432        let undef4 = 0xFFFF_FFFFu64;
433        let data = build_rawdata_node(
434            0,
435            1,
436            undef4,
437            undef4,
438            &[
439                (512, 0x03, vec![0, 0]), // ndims=1, key has 2 offsets
440                (512, 0, vec![100, 0]),
441            ],
442            &[0x3000],
443            4,
444        );
445
446        let mut cursor = Cursor::new(&data);
447        let node = BTreeV1Node::parse(&mut cursor, 4, 4, Some(1)).unwrap();
448
449        assert_eq!(node.node_type, 1);
450        match &node.keys[0] {
451            BTreeV1Key::RawData {
452                filter_mask,
453                offsets,
454                ..
455            } => {
456                assert_eq!(*filter_mask, 0x03);
457                assert_eq!(offsets, &[0, 0]);
458            }
459            _ => panic!("expected RawData key"),
460        }
461    }
462
463    #[test]
464    fn test_bad_signature() {
465        let mut data = build_group_node(0, 0, u64::MAX, u64::MAX, &[0], &[], 8, 8);
466        data[0] = b'X'; // corrupt
467        let mut cursor = Cursor::new(&data);
468        assert!(matches!(
469            BTreeV1Node::parse(&mut cursor, 8, 8, None),
470            Err(Error::InvalidBTreeSignature)
471        ));
472    }
473
474    #[test]
475    fn test_collect_leaves_single_leaf() {
476        let undef8 = u64::MAX;
477        // Put the leaf node at offset 0 in our fake file data.
478        let node_data = build_group_node(0, 2, undef8, undef8, &[0, 5, 10], &[0x100, 0x200], 8, 8);
479
480        let results = collect_btree_v1_leaves(&node_data, 0, 8, 8, None, &[], None).unwrap();
481
482        assert_eq!(results.len(), 2);
483        match &results[0].0 {
484            BTreeV1Key::Group { local_heap_offset } => assert_eq!(*local_heap_offset, 0),
485            _ => panic!("expected Group key"),
486        }
487        assert_eq!(results[0].1, 0x100);
488        assert_eq!(results[1].1, 0x200);
489    }
490
491    #[test]
492    fn test_collect_leaves_two_level_tree() {
493        let undef8 = u64::MAX;
494
495        // Build a two-level tree:
496        //   Root (level 1) at offset 0, with 2 children
497        //     Leaf A at offset 1000
498        //     Leaf B at offset 2000
499
500        let leaf_a = build_group_node(0, 1, undef8, undef8, &[0, 5], &[0xA00], 8, 8);
501
502        let leaf_b = build_group_node(0, 1, undef8, undef8, &[10, 15], &[0xB00], 8, 8);
503
504        let root = build_group_node(
505            1,
506            2,
507            undef8,
508            undef8,
509            &[0, 5, 15],   // 3 keys for 2 children
510            &[1000, 2000], // children at offsets 1000 and 2000
511            8,
512            8,
513        );
514
515        // Assemble into a single buffer: root at 0, leaf_a at 1000, leaf_b at 2000.
516        let total_size = 3000 + leaf_b.len();
517        let mut file_data = vec![0u8; total_size];
518        file_data[..root.len()].copy_from_slice(&root);
519        file_data[1000..1000 + leaf_a.len()].copy_from_slice(&leaf_a);
520        file_data[2000..2000 + leaf_b.len()].copy_from_slice(&leaf_b);
521
522        let results = collect_btree_v1_leaves(&file_data, 0, 8, 8, None, &[], None).unwrap();
523
524        assert_eq!(results.len(), 2);
525        assert_eq!(results[0].1, 0xA00);
526        assert_eq!(results[1].1, 0xB00);
527    }
528
529    #[test]
530    fn test_collect_undefined_root() {
531        // Undefined root address should produce empty results.
532        let data = vec![0u8; 100];
533        let results = collect_btree_v1_leaves(&data, u64::MAX, 8, 8, None, &[], None).unwrap();
534        assert!(results.is_empty());
535    }
536}