//! HDF5 B-tree Version 2 (`hdf5_reader/btree_v2.rs`).
//!
//! V2 B-trees are used by newer-style groups and datasets for indexed link
//! storage, attribute storage, and chunked dataset indexing. The header
//! (`BTHD`) describes the tree parameters. Internal nodes (`BTIN`) and leaf
//! nodes (`BTLF`) contain the actual records.
//!
//! This module provides the header parse, record types, and a traversal
//! function that collects all records from a tree.

11use crate::checksum::jenkins_lookup3;
12use crate::error::{Error, Result};
13use crate::io::Cursor;
14
// ---------------------------------------------------------------------------
// Signatures
// ---------------------------------------------------------------------------

/// Four-byte magic marking a v2 B-tree header block.
const BTHD_SIGNATURE: [u8; 4] = *b"BTHD";
/// Four-byte magic marking a v2 B-tree internal node.
const BTIN_SIGNATURE: [u8; 4] = *b"BTIN";
/// Four-byte magic marking a v2 B-tree leaf node.
const BTLF_SIGNATURE: [u8; 4] = *b"BTLF";
22
// ---------------------------------------------------------------------------
// Header
// ---------------------------------------------------------------------------

/// Parsed B-tree v2 header (`BTHD` block).
#[derive(Debug, Clone)]
pub struct BTreeV2Header {
    /// B-tree type (determines the record format; see [`BTreeV2Record`]).
    pub btree_type: u8,
    /// Size in bytes of each B-tree node (both internal and leaf).
    pub node_size: u32,
    /// Size in bytes of each record.
    pub record_size: u16,
    /// Depth of the tree (0 = the root node is a leaf).
    pub depth: u16,
    /// Percent full at which to split a node (not consulted by traversal here).
    pub split_percent: u8,
    /// Percent full at which to merge a node (not consulted by traversal here).
    pub merge_percent: u8,
    /// File address of the root node; may be the undefined address for an
    /// empty tree.
    pub root_node_address: u64,
    /// Number of records stored directly in the root node.
    pub num_records_in_root: u16,
    /// Total number of records in the entire tree.
    pub total_records: u64,
}
49
50impl BTreeV2Header {
51    /// Parse a B-tree v2 header at the current cursor position.
52    ///
53    /// Format:
54    /// - Signature: `BTHD` (4 bytes)
55    /// - Version: 0 (1 byte)
56    /// - B-tree type (u8)
57    /// - Node size (u32 LE)
58    /// - Record size (u16 LE)
59    /// - Depth (u16 LE)
60    /// - Split percent (u8)
61    /// - Merge percent (u8)
62    /// - Root node address (`offset_size` bytes)
63    /// - Number of records in root node (u16 LE)
64    /// - Total number of records in tree (`length_size` bytes)
65    /// - Checksum (u32 LE)
66    pub fn parse(cursor: &mut Cursor, offset_size: u8, length_size: u8) -> Result<Self> {
67        let start = cursor.position();
68
69        let sig = cursor.read_bytes(4)?;
70        if sig != BTHD_SIGNATURE {
71            return Err(Error::InvalidBTreeV2Signature { context: "header" });
72        }
73
74        let version = cursor.read_u8()?;
75        if version != 0 {
76            return Err(Error::UnsupportedBTreeVersion(version));
77        }
78
79        let btree_type = cursor.read_u8()?;
80        let node_size = cursor.read_u32_le()?;
81        let record_size = cursor.read_u16_le()?;
82        let depth = cursor.read_u16_le()?;
83        let split_percent = cursor.read_u8()?;
84        let merge_percent = cursor.read_u8()?;
85        let root_node_address = cursor.read_offset(offset_size)?;
86        let num_records_in_root = cursor.read_u16_le()?;
87        let total_records = cursor.read_length(length_size)?;
88
89        // Checksum covers everything from signature through total_records.
90        let checksum_end = cursor.position();
91        let stored_checksum = cursor.read_u32_le()?;
92
93        let computed = jenkins_lookup3(&cursor.data()[start as usize..checksum_end as usize]);
94        if computed != stored_checksum {
95            return Err(Error::ChecksumMismatch {
96                expected: stored_checksum,
97                actual: computed,
98            });
99        }
100
101        Ok(BTreeV2Header {
102            btree_type,
103            node_size,
104            record_size,
105            depth,
106            split_percent,
107            merge_percent,
108            root_node_address,
109            num_records_in_root,
110            total_records,
111        })
112    }
113}
114
// ---------------------------------------------------------------------------
// Records
// ---------------------------------------------------------------------------

/// A record from a B-tree v2.
///
/// The record format depends on the B-tree type field in the header; record
/// types not handled below are preserved verbatim as [`BTreeV2Record::Unknown`].
#[derive(Debug, Clone)]
pub enum BTreeV2Record {
    /// Type 5: Link name for indexed group (hashed name + heap ID bytes).
    LinkNameHash { hash: u32, heap_id: Vec<u8> },
    /// Type 6: Creation order for indexed group.
    CreationOrder { order: u64, heap_id: Vec<u8> },
    /// Type 8: Attribute name (hashed), with flags and creation order.
    AttributeNameHash {
        hash: u32,
        flags: u8,
        creation_order: u32,
        heap_id: Vec<u8>,
    },
    /// Type 9: Attribute creation order.
    AttributeCreationOrder { order: u32, heap_id: Vec<u8> },
    /// Type 10: Non-filtered chunked dataset record (v2 chunk index).
    ChunkedNonFiltered { address: u64, offsets: Vec<u64> },
    /// Type 11: Filtered chunked dataset record (v2 chunk index).
    ChunkedFiltered {
        address: u64,
        chunk_size: u64,
        filter_mask: u32,
        offsets: Vec<u64>,
    },
    /// Unknown/unsupported record type — raw bytes preserved.
    Unknown { record_type: u8, data: Vec<u8> },
}
149
150// ---------------------------------------------------------------------------
151// Record parsing
152// ---------------------------------------------------------------------------
153
154/// Parse a single record of the given B-tree type.
155fn parse_record(
156    cursor: &mut Cursor,
157    btree_type: u8,
158    record_size: u16,
159    offset_size: u8,
160    length_size: u8,
161    _ndims: Option<u32>,
162    heap_id_len: usize,
163) -> Result<BTreeV2Record> {
164    let record_start = cursor.position();
165
166    let record = match btree_type {
167        // Type 5: link name hash
168        5 => {
169            let hash = cursor.read_u32_le()?;
170            let heap_id = cursor.read_bytes(heap_id_len)?.to_vec();
171            BTreeV2Record::LinkNameHash { hash, heap_id }
172        }
173
174        // Type 6: creation order
175        6 => {
176            let order = cursor.read_u64_le()?;
177            let heap_id = cursor.read_bytes(heap_id_len)?.to_vec();
178            BTreeV2Record::CreationOrder { order, heap_id }
179        }
180
181        // Type 8: attribute name hash
182        8 => {
183            let hash = cursor.read_u32_le()?;
184            let flags = cursor.read_u8()?;
185            let creation_order = cursor.read_u32_le()?;
186            let heap_id = cursor.read_bytes(heap_id_len)?.to_vec();
187            BTreeV2Record::AttributeNameHash {
188                hash,
189                flags,
190                creation_order,
191                heap_id,
192            }
193        }
194
195        // Type 9: attribute creation order
196        9 => {
197            let order = cursor.read_u32_le()?;
198            let heap_id = cursor.read_bytes(heap_id_len)?.to_vec();
199            BTreeV2Record::AttributeCreationOrder { order, heap_id }
200        }
201
202        // Type 10: non-filtered chunk
203        10 => {
204            let address = cursor.read_offset(offset_size)?;
205            // Chunk offsets are encoded as scaled 64-bit values.
206            // The number of offset dimensions is calculated from the record size.
207            // Each offset is 8 bytes in a type-10 record.
208            let offset_bytes_available = record_size as usize - offset_size as usize;
209            let num_offsets = offset_bytes_available / 8;
210            let mut offsets = Vec::with_capacity(num_offsets);
211            for _ in 0..num_offsets {
212                offsets.push(cursor.read_u64_le()?);
213            }
214            BTreeV2Record::ChunkedNonFiltered { address, offsets }
215        }
216
217        // Type 11: filtered chunk
218        11 => {
219            let address = cursor.read_offset(offset_size)?;
220            // nbytes (chunk size on disk) is encoded using length_size bytes.
221            let nbytes_size = length_size as usize;
222            let chunk_size = cursor.read_length(length_size)?;
223            let filter_mask = cursor.read_u32_le()?;
224            let remaining = record_size as usize - offset_size as usize - nbytes_size - 4; // filter_mask
225            let num_offsets = remaining / 8;
226            let mut offsets = Vec::with_capacity(num_offsets);
227            for _ in 0..num_offsets {
228                offsets.push(cursor.read_u64_le()?);
229            }
230            BTreeV2Record::ChunkedFiltered {
231                address,
232                chunk_size,
233                filter_mask,
234                offsets,
235            }
236        }
237
238        // Unknown type — read raw bytes.
239        _ => {
240            let data = cursor.read_bytes(record_size as usize)?.to_vec();
241            return Ok(BTreeV2Record::Unknown {
242                record_type: btree_type,
243                data,
244            });
245        }
246    };
247
248    // Ensure we consumed exactly record_size bytes (skip any remaining).
249    let consumed = (cursor.position() - record_start) as usize;
250    if consumed < record_size as usize {
251        cursor.skip(record_size as usize - consumed)?;
252    }
253
254    Ok(record)
255}
256
257fn record_matches_chunk_bounds(
258    record: &BTreeV2Record,
259    chunk_dims: &[u32],
260    chunk_bounds: Option<(&[u64], &[u64])>,
261) -> bool {
262    let Some((first_chunk, last_chunk)) = chunk_bounds else {
263        return true;
264    };
265
266    let offsets = match record {
267        BTreeV2Record::ChunkedNonFiltered { offsets, .. }
268        | BTreeV2Record::ChunkedFiltered { offsets, .. } => offsets,
269        _ => return true,
270    };
271
272    offsets.iter().enumerate().all(|(dim, offset)| {
273        let chunk_index = *offset / u64::from(chunk_dims[dim]);
274        chunk_index >= first_chunk[dim] && chunk_index <= last_chunk[dim]
275    })
276}
277
// ---------------------------------------------------------------------------
// Node parsing
// ---------------------------------------------------------------------------

/// Number of bytes needed to encode `max_records` as an unsigned integer
/// (used for the per-child record counts stored in internal nodes).
fn num_records_size(max_records: u64) -> usize {
    match max_records {
        0..=0xFF => 1,
        0x100..=0xFFFF => 2,
        0x1_0000..=0xFFFF_FFFF => 4,
        _ => 8,
    }
}
295
/// Maximum number of records that fit in a leaf node.
///
/// A leaf's fixed overhead is signature(4) + version(1) + type(1) +
/// checksum(4) = 10 bytes; everything else is record space. Returns 0 for a
/// zero record size or a node too small to hold any record.
fn max_leaf_records(node_size: u32, record_size: u16) -> u64 {
    const LEAF_OVERHEAD: u32 = 10;
    if record_size == 0 {
        return 0;
    }
    u64::from(node_size.saturating_sub(LEAF_OVERHEAD) / u32::from(record_size))
}
305
/// Maximum number of records that fit in an internal node.
///
/// An internal node holds `n` records plus `n + 1` child entries, each a
/// child pointer (`offset_size` bytes) and a child record count whose width
/// depends on `max_child_records`. Solving
/// `n * record_size + (n + 1) * (offset_size + nrec_size) <= available`
/// for `n` gives the capacity. Returns 0 when nothing fits.
fn max_internal_records(
    node_size: u32,
    record_size: u16,
    offset_size: u8,
    max_child_records: u64,
) -> u64 {
    // Internal node overhead: signature(4) + version(1) + type(1) + checksum(4).
    const OVERHEAD: u32 = 10;
    if node_size <= OVERHEAD || record_size == 0 {
        return 0;
    }
    let available = u64::from(node_size - OVERHEAD);

    // Width of each child's record count (inlined byte-width table).
    let nrec_size: u64 = match max_child_records {
        0..=0xFF => 1,
        0x100..=0xFFFF => 2,
        0x1_0000..=0xFFFF_FFFF => 4,
        _ => 8,
    };

    // n * (record_size + offset_size + nrec_size) + (offset_size + nrec_size)
    //   <= available
    let entry_size = u64::from(record_size) + u64::from(offset_size) + nrec_size;
    let extra = u64::from(offset_size) + nrec_size;
    if available <= extra {
        return 0;
    }
    (available - extra) / entry_size
}
334
335/// Parse a leaf node and collect its records.
336#[allow(clippy::too_many_arguments)]
337fn parse_leaf_node(
338    cursor: &mut Cursor,
339    header: &BTreeV2Header,
340    offset_size: u8,
341    length_size: u8,
342    ndims: Option<u32>,
343    chunk_dims: &[u32],
344    chunk_bounds: Option<(&[u64], &[u64])>,
345    num_records: u16,
346    heap_id_len: usize,
347    records: &mut Vec<BTreeV2Record>,
348) -> Result<()> {
349    let start = cursor.position();
350
351    let sig = cursor.read_bytes(4)?;
352    if sig != BTLF_SIGNATURE {
353        return Err(Error::InvalidBTreeV2Signature {
354            context: "leaf node",
355        });
356    }
357
358    let version = cursor.read_u8()?;
359    if version != 0 {
360        return Err(Error::UnsupportedBTreeVersion(version));
361    }
362
363    let node_type = cursor.read_u8()?;
364    if node_type != header.btree_type {
365        return Err(Error::InvalidData(format!(
366            "B-tree v2 leaf node type mismatch: header says {}, node says {}",
367            header.btree_type, node_type
368        )));
369    }
370
371    for _ in 0..num_records {
372        let record = parse_record(
373            cursor,
374            header.btree_type,
375            header.record_size,
376            offset_size,
377            length_size,
378            ndims,
379            heap_id_len,
380        )?;
381        if record_matches_chunk_bounds(&record, chunk_dims, chunk_bounds) {
382            records.push(record);
383        }
384    }
385
386    // Verify checksum: covers signature through the end of records.
387    let checksum_data_end = cursor.position();
388    let stored_checksum = cursor.read_u32_le()?;
389    let computed = jenkins_lookup3(&cursor.data()[start as usize..checksum_data_end as usize]);
390    if computed != stored_checksum {
391        return Err(Error::ChecksumMismatch {
392            expected: stored_checksum,
393            actual: computed,
394        });
395    }
396
397    Ok(())
398}
399
/// Parse an internal node (`BTIN`), collecting its records and recursing
/// into its children.
///
/// Node layout: signature(4), version(1), type(1), then all `num_records`
/// records, then `num_records + 1` child entries — each a child address
/// (`offset_size` bytes), a child record count, and (when `depth > 1`) a
/// total-record count for the child's whole sub-tree — and finally a
/// Jenkins lookup3 checksum over everything before it.
#[allow(clippy::too_many_arguments)]
fn parse_internal_node(
    data: &[u8],
    address: u64,
    header: &BTreeV2Header,
    offset_size: u8,
    length_size: u8,
    ndims: Option<u32>,
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
    num_records: u16,
    depth: u16,
    heap_id_len: usize,
    records: &mut Vec<BTreeV2Record>,
) -> Result<()> {
    let mut cursor = Cursor::new(data);
    cursor.set_position(address);

    // Checksum coverage starts at the signature.
    let start = cursor.position();

    let sig = cursor.read_bytes(4)?;
    if sig != BTIN_SIGNATURE {
        return Err(Error::InvalidBTreeV2Signature {
            context: "internal node",
        });
    }

    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::UnsupportedBTreeVersion(version));
    }

    // The node's type byte must agree with the tree header.
    let node_type = cursor.read_u8()?;
    if node_type != header.btree_type {
        return Err(Error::InvalidData(format!(
            "B-tree v2 internal node type mismatch: header says {}, node says {}",
            header.btree_type, node_type
        )));
    }

    // The encoding width of each child's record count depends on the maximum
    // number of records a child node can hold. Children of a depth-1 node are
    // leaves; for deeper trees, iterate the internal-node capacity up from
    // the leaf capacity.
    let max_child_records = if depth == 1 {
        max_leaf_records(header.node_size, header.record_size)
    } else {
        // For deeper trees, compute iteratively.
        let leaf_max = max_leaf_records(header.node_size, header.record_size);
        let mut prev_max = leaf_max;
        for _ in 1..depth {
            prev_max =
                max_internal_records(header.node_size, header.record_size, offset_size, prev_max);
        }
        prev_max
    };
    let nrec_bytes = num_records_size(max_child_records);

    // Per the HDF5 spec, the node body is laid out as:
    //   record[0], ..., record[n-1],
    //   child_ptr[0], nrec[0], (total[0],) ..., child_ptr[n], nrec[n], (total[n])
    // i.e. all records first, then the child pointers with their metadata.

    // Read all records, keeping only those inside the requested chunk bounds.
    let mut node_records = Vec::with_capacity(num_records as usize);
    for _ in 0..num_records {
        let record = parse_record(
            &mut cursor,
            header.btree_type,
            header.record_size,
            offset_size,
            length_size,
            ndims,
            heap_id_len,
        )?;
        if record_matches_chunk_bounds(&record, chunk_dims, chunk_bounds) {
            node_records.push(record);
        }
    }

    // Read child node pointers (num_records + 1 of them).
    let num_children = num_records as usize + 1;

    // A per-child "total records in sub-tree" field is present only when the
    // children are themselves internal nodes (depth > 1).
    let has_total_records = depth > 1;
    // NOTE(review): the spec derives this field's width from the maximum
    // possible sub-tree total; length_size is used here as an upper bound —
    // confirm against files with very deep trees.
    let total_nrec_bytes = if has_total_records {
        length_size as usize
    } else {
        0
    };

    let mut child_addresses = Vec::with_capacity(num_children);
    let mut child_nrecords = Vec::with_capacity(num_children);

    for _ in 0..num_children {
        let child_addr = cursor.read_offset(offset_size)?;
        child_addresses.push(child_addr);
        // NOTE(review): child record counts are truncated to u16 here —
        // confirm node capacities always fit in 16 bits.
        let nrec = cursor.read_uvar(nrec_bytes)?;
        child_nrecords.push(nrec as u16);
        if has_total_records {
            // Skip the sub-tree total; traversal does not need it.
            cursor.read_uvar(total_nrec_bytes)?;
        }
    }

    // Verify checksum (signature through the last child entry).
    let checksum_data_end = cursor.position();
    let stored_checksum = cursor.read_u32_le()?;
    let computed = jenkins_lookup3(&cursor.data()[start as usize..checksum_data_end as usize]);
    if computed != stored_checksum {
        return Err(Error::ChecksumMismatch {
            expected: stored_checksum,
            actual: computed,
        });
    }

    // In v2 B-trees, internal-node records are real records (not mere
    // separator keys), so they belong in the result set.
    records.extend(node_records);

    // Recurse into children; child_depth == 0 means the children are leaves.
    let child_depth = depth - 1;
    for (i, &child_addr) in child_addresses.iter().enumerate() {
        if Cursor::is_undefined_offset(child_addr, offset_size) {
            continue;
        }
        let child_nrec = child_nrecords[i];
        if child_depth == 0 {
            // Child is a leaf.
            let mut child_cursor = Cursor::new(data);
            child_cursor.set_position(child_addr);
            parse_leaf_node(
                &mut child_cursor,
                header,
                offset_size,
                length_size,
                ndims,
                chunk_dims,
                chunk_bounds,
                child_nrec,
                heap_id_len,
                records,
            )?;
        } else {
            // Child is another internal node.
            parse_internal_node(
                data,
                child_addr,
                header,
                offset_size,
                length_size,
                ndims,
                chunk_dims,
                chunk_bounds,
                child_nrec,
                child_depth,
                heap_id_len,
                records,
            )?;
        }
    }

    Ok(())
}
575
576// ---------------------------------------------------------------------------
577// Public traversal
578// ---------------------------------------------------------------------------
579
580/// Collect all records from a B-tree v2 by traversing from the root.
581///
582/// `header` must be a previously parsed `BTreeV2Header`. `data` is the full
583/// file buffer. `ndims` is needed for chunk index record types (10 and 11).
584pub fn collect_btree_v2_records(
585    data: &[u8],
586    header: &BTreeV2Header,
587    offset_size: u8,
588    length_size: u8,
589    ndims: Option<u32>,
590    chunk_dims: &[u32],
591    chunk_bounds: Option<(&[u64], &[u64])>,
592) -> Result<Vec<BTreeV2Record>> {
593    if Cursor::is_undefined_offset(header.root_node_address, offset_size) {
594        return Ok(Vec::new());
595    }
596
597    if header.total_records == 0 || header.num_records_in_root == 0 {
598        return Ok(Vec::new());
599    }
600
601    // Determine heap_id_len from the record_size and btree_type.
602    let heap_id_len = compute_heap_id_len(header);
603
604    let mut records = Vec::new();
605
606    if header.depth == 0 {
607        // Root is a leaf node.
608        let mut cursor = Cursor::new(data);
609        cursor.set_position(header.root_node_address);
610        parse_leaf_node(
611            &mut cursor,
612            header,
613            offset_size,
614            length_size,
615            ndims,
616            chunk_dims,
617            chunk_bounds,
618            header.num_records_in_root,
619            heap_id_len,
620            &mut records,
621        )?;
622    } else {
623        // Root is an internal node.
624        parse_internal_node(
625            data,
626            header.root_node_address,
627            header,
628            offset_size,
629            length_size,
630            ndims,
631            chunk_dims,
632            chunk_bounds,
633            header.num_records_in_root,
634            header.depth,
635            heap_id_len,
636            &mut records,
637        )?;
638    }
639
640    Ok(records)
641}
642
643/// Compute the heap ID length from the record size and tree type.
644///
645/// For link/attribute B-trees (types 5, 6, 8, 9), the heap ID occupies
646/// the remaining bytes after the fixed fields. For chunk types (10, 11)
647/// or unknown types, return 0 (heap_id is not used).
648fn compute_heap_id_len(header: &BTreeV2Header) -> usize {
649    let rs = header.record_size as usize;
650    match header.btree_type {
651        5 => rs.saturating_sub(4),         // hash(4)
652        6 => rs.saturating_sub(8),         // order(8)
653        8 => rs.saturating_sub(4 + 1 + 4), // hash(4) + flags(1) + creation_order(4)
654        9 => rs.saturating_sub(4),         // order(4)
655        _ => 0,
656    }
657}
658
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a serialized `BTHD` header block (with a valid trailing
    /// checksum) from the given parameters.
    #[allow(clippy::too_many_arguments)]
    fn build_header(
        btree_type: u8,
        node_size: u32,
        record_size: u16,
        depth: u16,
        root_node_address: u64,
        num_records_in_root: u16,
        total_records: u64,
        offset_size: u8,
        length_size: u8,
    ) -> Vec<u8> {
        let mut buf = Vec::new();
        buf.extend_from_slice(b"BTHD");
        buf.push(0); // version
        buf.push(btree_type);
        buf.extend_from_slice(&node_size.to_le_bytes());
        buf.extend_from_slice(&record_size.to_le_bytes());
        buf.extend_from_slice(&depth.to_le_bytes());
        buf.push(75); // split percent
        buf.push(40); // merge percent
        // Root address is written at the requested offset width.
        match offset_size {
            4 => buf.extend_from_slice(&(root_node_address as u32).to_le_bytes()),
            8 => buf.extend_from_slice(&root_node_address.to_le_bytes()),
            _ => panic!("unsupported"),
        }
        buf.extend_from_slice(&num_records_in_root.to_le_bytes());
        // Total-records is written at the requested length width.
        match length_size {
            4 => buf.extend_from_slice(&(total_records as u32).to_le_bytes()),
            8 => buf.extend_from_slice(&total_records.to_le_bytes()),
            _ => panic!("unsupported"),
        }
        // Compute and append checksum (covers everything written so far).
        let checksum = jenkins_lookup3(&buf);
        buf.extend_from_slice(&checksum.to_le_bytes());
        buf
    }

    /// Round-trip: every header field survives serialize + parse.
    #[test]
    fn test_parse_header() {
        let data = build_header(5, 4096, 12, 0, 0x1000, 3, 3, 8, 8);
        let mut cursor = Cursor::new(&data);
        let hdr = BTreeV2Header::parse(&mut cursor, 8, 8).unwrap();

        assert_eq!(hdr.btree_type, 5);
        assert_eq!(hdr.node_size, 4096);
        assert_eq!(hdr.record_size, 12);
        assert_eq!(hdr.depth, 0);
        assert_eq!(hdr.split_percent, 75);
        assert_eq!(hdr.merge_percent, 40);
        assert_eq!(hdr.root_node_address, 0x1000);
        assert_eq!(hdr.num_records_in_root, 3);
        assert_eq!(hdr.total_records, 3);
    }

    /// A corrupted signature byte must be rejected before any other check.
    #[test]
    fn test_bad_signature() {
        let mut data = build_header(5, 4096, 12, 0, 0x1000, 0, 0, 8, 8);
        data[0] = b'X';
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            BTreeV2Header::parse(&mut cursor, 8, 8),
            Err(Error::InvalidBTreeV2Signature { .. })
        ));
    }

    /// A corrupted payload byte must fail checksum verification.
    #[test]
    fn test_bad_checksum() {
        let mut data = build_header(5, 4096, 12, 0, 0x1000, 0, 0, 8, 8);
        // Corrupt a byte in the middle (inside the checksummed region).
        data[6] = 0xFF;
        let mut cursor = Cursor::new(&data);
        assert!(matches!(
            BTreeV2Header::parse(&mut cursor, 8, 8),
            Err(Error::ChecksumMismatch { .. })
        ));
    }

    /// An undefined root address / zero record count yields no records and
    /// never reads the file buffer.
    #[test]
    fn test_collect_empty_tree() {
        let header = BTreeV2Header {
            btree_type: 5,
            node_size: 4096,
            record_size: 12,
            depth: 0,
            split_percent: 75,
            merge_percent: 40,
            root_node_address: u64::MAX,
            num_records_in_root: 0,
            total_records: 0,
        };
        let data = vec![0u8; 100];
        let records = collect_btree_v2_records(&data, &header, 8, 8, None, &[], None).unwrap();
        assert!(records.is_empty());
    }

    /// Heap-ID length is the record size minus the type's fixed fields.
    #[test]
    fn test_compute_heap_id_len() {
        // Type 5: record_size - 4
        let h5 = BTreeV2Header {
            btree_type: 5,
            record_size: 12,
            node_size: 0,
            depth: 0,
            split_percent: 0,
            merge_percent: 0,
            root_node_address: 0,
            num_records_in_root: 0,
            total_records: 0,
        };
        assert_eq!(compute_heap_id_len(&h5), 8);

        // Type 8: record_size - 9
        let h8 = BTreeV2Header {
            btree_type: 8,
            record_size: 17,
            ..h5
        };
        assert_eq!(compute_heap_id_len(&h8), 8);
    }

    #[test]
    fn test_max_leaf_records() {
        // node_size=4096, record_size=12, overhead=10
        // => (4096 - 10) / 12 = 340
        assert_eq!(max_leaf_records(4096, 12), 340);
    }

    /// Byte-width table boundaries for child record counts.
    #[test]
    fn test_num_records_size() {
        assert_eq!(num_records_size(0), 1);
        assert_eq!(num_records_size(255), 1);
        assert_eq!(num_records_size(256), 2);
        assert_eq!(num_records_size(65535), 2);
        assert_eq!(num_records_size(65536), 4);
    }

    /// End-to-end leaf parse: two type-5 records plus a valid checksum.
    #[test]
    fn test_parse_leaf_with_type5_records() {
        // Build a leaf node with 2 type-5 records (link name hash).
        // record_size = 12 (hash=4 + heap_id=8)
        let record_size: u16 = 12;
        let node_size: u32 = 4096;

        let header = BTreeV2Header {
            btree_type: 5,
            node_size,
            record_size,
            depth: 0,
            split_percent: 75,
            merge_percent: 40,
            root_node_address: 0, // will point to our leaf
            num_records_in_root: 2,
            total_records: 2,
        };

        // Build the leaf node.
        let mut leaf = Vec::new();
        leaf.extend_from_slice(b"BTLF"); // signature
        leaf.push(0); // version
        leaf.push(5); // type

        // Record 1: hash=0xAABBCCDD, heap_id=[1,2,3,4,5,6,7,8]
        leaf.extend_from_slice(&0xAABBCCDDu32.to_le_bytes());
        leaf.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]);

        // Record 2: hash=0x11223344, heap_id=[9,10,11,12,13,14,15,16]
        leaf.extend_from_slice(&0x11223344u32.to_le_bytes());
        leaf.extend_from_slice(&[9, 10, 11, 12, 13, 14, 15, 16]);

        // Checksum covers everything so far.
        let checksum = jenkins_lookup3(&leaf);
        leaf.extend_from_slice(&checksum.to_le_bytes());

        // Pad to node_size (not strictly needed for parsing, but realistic).
        leaf.resize(node_size as usize, 0);

        let mut records = Vec::new();
        let mut cursor = Cursor::new(&leaf);
        parse_leaf_node(
            &mut cursor,
            &header,
            8,
            8,
            None,
            &[],
            None,
            2,
            8, // heap_id_len
            &mut records,
        )
        .unwrap();

        assert_eq!(records.len(), 2);
        match &records[0] {
            BTreeV2Record::LinkNameHash { hash, heap_id } => {
                assert_eq!(*hash, 0xAABBCCDD);
                assert_eq!(heap_id, &[1, 2, 3, 4, 5, 6, 7, 8]);
            }
            _ => panic!("expected LinkNameHash"),
        }
        match &records[1] {
            BTreeV2Record::LinkNameHash { hash, heap_id } => {
                assert_eq!(*hash, 0x11223344);
                assert_eq!(heap_id, &[9, 10, 11, 12, 13, 14, 15, 16]);
            }
            _ => panic!("expected LinkNameHash"),
        }
    }
873}