//! Version 1 B-tree parsing (`async_hdf5/btree/v1.rs`).
//!
//! Covers type-0 (group symbol table) and type-1 (raw data chunk) v1 B-trees.
use std::sync::Arc;

use crate::endian::HDF5Reader;
use crate::error::{HDF5Error, Result};
use crate::heap::local::LocalHeap;
use crate::reader::AsyncFileReader;
7
/// A single entry from a v1 B-tree group node (type 0).
///
/// Maps a link name to a symbol table node address.
#[derive(Debug, Clone)]
pub struct SymbolTableEntry {
    /// Link name (resolved from the group's local heap).
    pub name: String,
    /// File address of the child object's header.
    pub object_header_address: u64,
    /// Cache type (0 = nothing cached, 1 = group metadata cached in the
    /// scratch-pad, 2 = symbolic link).
    pub cache_type: u32,
    /// For `cache_type == 1`: child group's B-tree address (from scratch-pad).
    pub child_btree_address: Option<u64>,
    /// For `cache_type == 1`: child group's local heap address (from scratch-pad).
    pub child_heap_address: Option<u64>,
}
24
/// A single entry from a v1 B-tree raw-data-chunk node (type 1).
///
/// Maps chunk coordinates to a byte offset and size in the file.
#[derive(Debug, Clone)]
pub struct ChunkEntry {
    /// Size of the chunk in bytes (compressed/on-disk size).
    pub size: u32,
    /// Filter mask — bit N set means filter N was *not* applied to this chunk.
    pub filter_mask: u32,
    /// Chunk offsets in each dimension.
    pub offsets: Vec<u64>,
    /// File address of the chunk data.
    pub address: u64,
}
39
40/// Traverse a v1 B-tree (type 0 — group nodes) and collect all symbol table entries.
41///
42/// This recursively follows the tree from the given root address, eventually
43/// reaching leaf nodes that point to Symbol Table Nodes (SNOD). Each SNOD
44/// contains symbol table entries mapping names (via the local heap) to object
45/// header addresses.
46pub async fn read_group_btree_v1(
47    reader: &Arc<dyn AsyncFileReader>,
48    btree_address: u64,
49    local_heap: &LocalHeap,
50    size_of_offsets: u8,
51    size_of_lengths: u8,
52) -> Result<Vec<SymbolTableEntry>> {
53    let mut entries = Vec::new();
54    read_group_btree_node(
55        reader,
56        btree_address,
57        local_heap,
58        size_of_offsets,
59        size_of_lengths,
60        &mut entries,
61    )
62    .await?;
63    Ok(entries)
64}
65
66/// Recursively read a v1 B-tree node.
67async fn read_group_btree_node(
68    reader: &Arc<dyn AsyncFileReader>,
69    address: u64,
70    local_heap: &LocalHeap,
71    size_of_offsets: u8,
72    size_of_lengths: u8,
73    entries: &mut Vec<SymbolTableEntry>,
74) -> Result<()> {
75    // Fetch enough data for the node header + reasonable number of entries.
76    // Node header: 4 (sig) + 1 (type) + 1 (level) + 2 (entries_used) + 2*O (siblings)
77    // Key size for type 0 = L bytes. Each entry = key(L) + child(O).
78    // Pattern: key[0], child[0], key[1], child[1], ..., key[N-1], child[N-1], key[N]
79    // Total keys = entries_used + 1, total children = entries_used.
80    // Conservative fetch: 8KB should cover most nodes.
81    let fetch_size = 8192u64;
82    let data = reader.get_bytes(address..address + fetch_size).await?;
83    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
84
85    // Verify signature
86    r.read_signature(b"TREE")?;
87
88    let node_type = r.read_u8()?;
89    if node_type != 0 {
90        return Err(HDF5Error::General(format!(
91            "expected B-tree v1 node type 0 (group), got {node_type}"
92        )));
93    }
94
95    let node_level = r.read_u8()?;
96    let entries_used = r.read_u16()? as usize;
97    let _left_sibling = r.read_offset()?;
98    let _right_sibling = r.read_offset()?;
99
100    if node_level > 0 {
101        // Internal node: keys and child pointers alternate.
102        // key[0], child[0], key[1], child[1], ..., key[N-1], child[N-1], key[N]
103        for _ in 0..entries_used {
104            let _key = r.read_length()?; // heap offset (type 0 key)
105            let child_address = r.read_offset()?;
106
107            // Recurse into child node
108            Box::pin(read_group_btree_node(
109                reader,
110                child_address,
111                local_heap,
112                size_of_offsets,
113                size_of_lengths,
114                entries,
115            ))
116            .await?;
117        }
118        // Skip trailing key
119        let _trailing_key = r.read_length()?;
120    } else {
121        // Leaf node: child pointers are addresses of Symbol Table Nodes (SNOD).
122        let mut child_addresses = Vec::with_capacity(entries_used);
123        for _ in 0..entries_used {
124            let _key = r.read_length()?;
125            let child_address = r.read_offset()?;
126            child_addresses.push(child_address);
127        }
128        // Skip trailing key
129        let _trailing_key = r.read_length()?;
130
131        // Read each SNOD
132        for snod_address in child_addresses {
133            let snod_entries = read_symbol_table_node(
134                reader,
135                snod_address,
136                local_heap,
137                size_of_offsets,
138                size_of_lengths,
139            )
140            .await?;
141            entries.extend(snod_entries);
142        }
143    }
144
145    Ok(())
146}
147
148/// Read a Symbol Table Node (SNOD) and extract all its entries.
149///
150/// Binary layout:
151///   - Signature "SNOD" (4 bytes)
152///   - Version (1 byte): 1
153///   - Reserved (1 byte)
154///   - Number of Symbols (2 bytes)
155///   - Entries... (each: name_offset(O) + header_address(O) + cache_type(4) + reserved(4) + scratch(16))
156async fn read_symbol_table_node(
157    reader: &Arc<dyn AsyncFileReader>,
158    address: u64,
159    local_heap: &LocalHeap,
160    size_of_offsets: u8,
161    size_of_lengths: u8,
162) -> Result<Vec<SymbolTableEntry>> {
163    // Each symbol table entry: 2*O + 4 + 4 + 16 = 2*O + 24 bytes.
164    // Max 2K entries (K from superblock, typically 16-32).
165    // Conservative: fetch 16KB.
166    let fetch_size = 16384u64;
167    let data = reader.get_bytes(address..address + fetch_size).await?;
168    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
169
170    // Verify signature
171    r.read_signature(b"SNOD")?;
172
173    let version = r.read_u8()?;
174    if version != 1 {
175        return Err(HDF5Error::General(format!(
176            "unsupported SNOD version: {version}"
177        )));
178    }
179
180    let _reserved = r.read_u8()?;
181    let num_symbols = r.read_u16()? as usize;
182
183    let mut entries = Vec::with_capacity(num_symbols);
184
185    for _ in 0..num_symbols {
186        let name_offset = r.read_offset()?;
187        let object_header_address = r.read_offset()?;
188        let cache_type = r.read_u32()?;
189        let _reserved = r.read_u32()?;
190
191        // 16 bytes scratch-pad space
192        let (child_btree_address, child_heap_address) = if cache_type == 1 {
193            // Group cache: B-tree address + heap address
194            let btree = r.read_offset()?;
195            let heap = r.read_offset()?;
196            // Skip remaining scratch-pad bytes (16 - 2*O)
197            let used = 2 * size_of_offsets as u64;
198            if used < 16 {
199                r.skip(16 - used);
200            }
201            (Some(btree), Some(heap))
202        } else {
203            r.skip(16); // unused scratch-pad
204            (None, None)
205        };
206
207        let name = local_heap.get_string(name_offset)?;
208
209        entries.push(SymbolTableEntry {
210            name,
211            object_header_address,
212            cache_type,
213            child_btree_address,
214            child_heap_address,
215        });
216    }
217
218    Ok(entries)
219}
220
221/// Traverse a v1 B-tree (type 1 — raw data chunks) and collect all chunk entries.
222///
223/// `ndims` is the dimensionality of the dataset (needed to parse chunk keys).
224pub async fn read_chunk_btree_v1(
225    reader: &Arc<dyn AsyncFileReader>,
226    btree_address: u64,
227    ndims: usize,
228    size_of_offsets: u8,
229    size_of_lengths: u8,
230) -> Result<Vec<ChunkEntry>> {
231    let mut entries = Vec::new();
232    read_chunk_btree_node(
233        reader,
234        btree_address,
235        ndims,
236        size_of_offsets,
237        size_of_lengths,
238        &mut entries,
239    )
240    .await?;
241    Ok(entries)
242}
243
244/// Recursively read a v1 B-tree node for chunk data.
245async fn read_chunk_btree_node(
246    reader: &Arc<dyn AsyncFileReader>,
247    address: u64,
248    ndims: usize,
249    size_of_offsets: u8,
250    size_of_lengths: u8,
251    entries: &mut Vec<ChunkEntry>,
252) -> Result<()> {
253    // Key size for type 1: 4 (size) + 4 (filter_mask) + (ndims+1)*8 (offsets)
254    let key_size = 8 + (ndims + 1) * 8;
255    // Fetch enough for a large node
256    let fetch_size = (8
257        + 2 * size_of_offsets as usize
258        + 1024 * (key_size + size_of_offsets as usize))
259        .min(65536) as u64;
260    let data = reader.get_bytes(address..address + fetch_size).await?;
261    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
262
263    r.read_signature(b"TREE")?;
264
265    let node_type = r.read_u8()?;
266    if node_type != 1 {
267        return Err(HDF5Error::General(format!(
268            "expected B-tree v1 node type 1 (chunk), got {node_type}"
269        )));
270    }
271
272    let node_level = r.read_u8()?;
273    let entries_used = r.read_u16()? as usize;
274    let _left_sibling = r.read_offset()?;
275    let _right_sibling = r.read_offset()?;
276
277    if node_level > 0 {
278        // Internal node
279        for _ in 0..entries_used {
280            // Skip key
281            r.skip(key_size as u64);
282            let child_address = r.read_offset()?;
283            Box::pin(read_chunk_btree_node(
284                reader,
285                child_address,
286                ndims,
287                size_of_offsets,
288                size_of_lengths,
289                entries,
290            ))
291            .await?;
292        }
293        // Skip trailing key
294        r.skip(key_size as u64);
295    } else {
296        // Leaf node — collect chunk entries
297        for _ in 0..entries_used {
298            // Key: size(4) + filter_mask(4) + offsets((ndims+1) * 8)
299            let chunk_size = r.read_u32()?;
300            let filter_mask = r.read_u32()?;
301            let mut offsets = Vec::with_capacity(ndims);
302            for _ in 0..ndims {
303                offsets.push(r.read_u64()?);
304            }
305            let _zero = r.read_u64()?; // trailing dimension (always 0)
306
307            let child_address = r.read_offset()?;
308
309            // Skip entries with undefined address (unallocated chunks)
310            if !HDF5Reader::is_undef_addr(child_address, size_of_offsets) {
311                entries.push(ChunkEntry {
312                    size: chunk_size,
313                    filter_mask,
314                    offsets,
315                    address: child_address,
316                });
317            }
318        }
319        // Skip trailing key
320        r.skip(key_size as u64);
321    }
322
323    Ok(())
324}