Skip to main content

async_hdf5/
extensible_array.rs

1//! Extensible Array (EA) chunk index reader.
2//!
3//! The Extensible Array is used for chunked datasets where the dataspace has a
4//! single unlimited dimension. It provides efficient append-oriented storage
5//! with O(1) lookup via a pre-balanced hierarchical structure.
6//!
7//! Structures:
8//! - EAHD (Extensible Array Header) — configuration + pointer to index block
9//! - EAIB (Extensible Array Index Block) — inline elements + data/super block addresses
10//! - EASB (Extensible Array Super Block) — page bitmaps + data block addresses
11//! - EADB (Extensible Array Data Block) — chunk entries, optionally paged
12
13use std::sync::Arc;
14
15use crate::endian::HDF5Reader;
16use crate::error::{HDF5Error, Result};
17use crate::fixed_array::{parse_entries, read_n_byte_uint, FixedArrayChunkEntry};
18use crate::reader::AsyncFileReader;
19
/// Geometry of one super block level, derived once from the array header.
#[derive(Debug, Clone)]
pub struct SBlockInfo {
    /// How many data blocks belong to this super block level.
    pub ndblks: usize,
    /// How many elements every data block of this level holds.
    pub dblk_nelmts: usize,
    /// Index of the first element this level covers, counted from the first
    /// element that lives outside the index block.
    pub start_idx: u64,
}
30
31/// Parsed Extensible Array header (EAHD).
32#[derive(Debug)]
33pub struct ExtensibleArrayHeader {
34    /// Client ID: 0 = non-filtered chunks, 1 = filtered chunks.
35    pub client_id: u8,
36    /// Size of each element in bytes.
37    pub element_size: u8,
38    /// Number of bits needed to store the max number of elements.
39    pub max_nelmts_bits: u8,
40    /// Number of elements stored directly in the index block.
41    pub idx_blk_elmts: u8,
42    /// Minimum number of elements per data block.
43    pub data_blk_min_elmts: u8,
44    /// Minimum number of data block pointers per super block.
45    pub sup_blk_min_data_ptrs: u8,
46    /// Log2 of max elements per data block page.
47    pub max_dblk_page_nelmts_bits: u8,
48    /// Address of the index block.
49    pub index_block_address: u64,
50
51    // Computed fields:
52    /// Total number of super block levels.
53    pub nsblks: usize,
54    /// Info for each super block level.
55    pub sblk_info: Vec<SBlockInfo>,
56    /// Number of elements per data block page (0 means no paging).
57    pub dblk_page_nelmts: usize,
58    /// Byte width of array offset fields in EASB/EADB.
59    pub arr_off_size: u8,
60}
61
62impl ExtensibleArrayHeader {
63    /// Read and parse an Extensible Array header from the given address.
64    pub async fn read(
65        reader: &Arc<dyn AsyncFileReader>,
66        address: u64,
67        size_of_offsets: u8,
68        size_of_lengths: u8,
69    ) -> Result<Self> {
70        // EAHD size: 4 (sig) + 1 (ver) + 1 (client) + 1 (elem_size)
71        //   + 1 (max_nelmts_bits) + 1 (idx_blk_elmts) + 1 (data_blk_min_elmts)
72        //   + 1 (sup_blk_min_data_ptrs) + 1 (max_dblk_page_nelmts_bits)
73        //   + 6*L (statistics) + O (index_block_address) + 4 (checksum)
74        let fetch_size = 12 + 6 * size_of_lengths as u64 + size_of_offsets as u64 + 4;
75        let data = reader.get_bytes(address..address + fetch_size).await?;
76        let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
77
78        r.read_signature(b"EAHD")?;
79        let version = r.read_u8()?;
80        if version != 0 {
81            return Err(HDF5Error::General(format!(
82                "unsupported Extensible Array header version: {version}"
83            )));
84        }
85
86        let client_id = r.read_u8()?;
87        let element_size = r.read_u8()?;
88        let max_nelmts_bits = r.read_u8()?;
89        let idx_blk_elmts = r.read_u8()?;
90        let data_blk_min_elmts = r.read_u8()?;
91        let sup_blk_min_data_ptrs = r.read_u8()?;
92        let max_dblk_page_nelmts_bits = r.read_u8()?;
93
94        // Skip 6 statistics fields (each is size_of_lengths)
95        r.skip(6 * size_of_lengths as u64);
96
97        let index_block_address = r.read_offset()?;
98        // Skip checksum
99
100        // Compute super block info table (mirrors H5EA__hdr_init)
101        let log2_min = log2_of_power_of_2(data_blk_min_elmts as u32);
102        let nsblks = 1 + (max_nelmts_bits as usize - log2_min as usize);
103        let dblk_page_nelmts = 1usize << max_dblk_page_nelmts_bits;
104        let arr_off_size = max_nelmts_bits.div_ceil(8);
105
106        let mut sblk_info = Vec::with_capacity(nsblks);
107        let mut start_idx = 0u64;
108        for s in 0..nsblks {
109            let ndblks = 1usize << (s / 2);
110            let dblk_nelmts = (1usize << s.div_ceil(2)) * data_blk_min_elmts as usize;
111            sblk_info.push(SBlockInfo {
112                ndblks,
113                dblk_nelmts,
114                start_idx,
115            });
116            start_idx += (ndblks as u64) * (dblk_nelmts as u64);
117        }
118
119        Ok(Self {
120            client_id,
121            element_size,
122            max_nelmts_bits,
123            idx_blk_elmts,
124            data_blk_min_elmts,
125            sup_blk_min_data_ptrs,
126            max_dblk_page_nelmts_bits,
127            index_block_address,
128            nsblks,
129            sblk_info,
130            dblk_page_nelmts,
131            arr_off_size,
132        })
133    }
134}
135
136/// A single entry with its flat index in the EA.
137pub(crate) struct IndexedEntry {
138    pub flat_idx: u64,
139    pub entry: FixedArrayChunkEntry,
140}
141
142/// Read all allocated chunk entries from an Extensible Array.
143///
144/// Returns `(flat_index, entry)` pairs for only allocated entries.
145/// Unallocated super blocks and data blocks are skipped entirely.
146pub(crate) async fn read_extensible_array_entries(
147    reader: &Arc<dyn AsyncFileReader>,
148    header: &ExtensibleArrayHeader,
149    size_of_offsets: u8,
150    size_of_lengths: u8,
151    uncompressed_chunk_size: u64,
152    layout_version: u8,
153) -> Result<Vec<IndexedEntry>> {
154    if HDF5Reader::is_undef_addr(header.index_block_address, size_of_offsets) {
155        return Ok(vec![]);
156    }
157
158    let is_filtered = header.client_id == 1;
159
160    // Compute index block layout (mirrors H5EA__iblock_alloc)
161    let nsblks_in_iblock = if header.sup_blk_min_data_ptrs <= 1 {
162        0usize
163    } else {
164        2 * log2_of_power_of_2(header.sup_blk_min_data_ptrs as u32) as usize
165    };
166    let ndblk_addrs = if header.sup_blk_min_data_ptrs <= 1 {
167        0usize
168    } else {
169        2 * (header.sup_blk_min_data_ptrs as usize - 1)
170    };
171    let nsblk_addrs = header.nsblks.saturating_sub(nsblks_in_iblock);
172
173    // Compute index block size to fetch
174    let iblock_size = 4 // signature
175        + 1 // version
176        + 1 // client_id
177        + size_of_offsets as usize // header address
178        + header.idx_blk_elmts as usize * header.element_size as usize // inline elements
179        + ndblk_addrs * size_of_offsets as usize // data block addresses
180        + nsblk_addrs * size_of_offsets as usize // super block addresses
181        + 4; // checksum
182
183    let data = reader
184        .get_bytes(header.index_block_address..header.index_block_address + iblock_size as u64)
185        .await?;
186    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
187
188    // Parse EAIB header
189    r.read_signature(b"EAIB")?;
190    let version = r.read_u8()?;
191    if version != 0 {
192        return Err(HDF5Error::General(format!(
193            "unsupported Extensible Array index block version: {version}"
194        )));
195    }
196    let _client_id = r.read_u8()?;
197    let _header_address = r.read_offset()?;
198
199    // Read inline elements (flat indices 0..idx_blk_elmts)
200    let mut result = Vec::new();
201    if header.idx_blk_elmts > 0 {
202        let inline = parse_entries(
203            &mut r,
204            header.idx_blk_elmts as usize,
205            is_filtered,
206            size_of_offsets,
207            uncompressed_chunk_size,
208            layout_version,
209        )?;
210        for (i, entry) in inline.into_iter().enumerate() {
211            if !HDF5Reader::is_undef_addr(entry.address, size_of_offsets) {
212                result.push(IndexedEntry {
213                    flat_idx: i as u64,
214                    entry,
215                });
216            }
217        }
218    }
219
220    // Read data block addresses stored directly in the index block
221    let mut dblk_addrs = Vec::with_capacity(ndblk_addrs);
222    for _ in 0..ndblk_addrs {
223        dblk_addrs.push(r.read_offset()?);
224    }
225
226    // Read super block addresses
227    let mut sblk_addrs = Vec::with_capacity(nsblk_addrs);
228    for _ in 0..nsblk_addrs {
229        sblk_addrs.push(r.read_offset()?);
230    }
231
232    // Process data blocks stored directly in the index block.
233    // These correspond to the first `nsblks_in_iblock` super block levels.
234    // Flat index starts after inline elements.
235    let base_idx = header.idx_blk_elmts as u64;
236    let mut dblk_idx = 0usize;
237    for s in 0..nsblks_in_iblock.min(header.nsblks) {
238        let info = &header.sblk_info[s];
239        for d in 0..info.ndblks {
240            if dblk_idx < dblk_addrs.len() {
241                let addr = dblk_addrs[dblk_idx];
242                dblk_idx += 1;
243                let flat_start = base_idx + info.start_idx + (d as u64) * (info.dblk_nelmts as u64);
244                collect_data_block_entries(
245                    reader,
246                    addr,
247                    info.dblk_nelmts,
248                    flat_start,
249                    header,
250                    size_of_offsets,
251                    size_of_lengths,
252                    uncompressed_chunk_size,
253                    layout_version,
254                    None,
255                    &mut result,
256                )
257                .await?;
258            }
259        }
260    }
261
262    // Process super blocks (those stored by address in the index block)
263    for (sblk_rel_idx, &sblk_addr) in sblk_addrs.iter().enumerate() {
264        let sblk_idx = nsblks_in_iblock + sblk_rel_idx;
265        if sblk_idx >= header.nsblks {
266            break;
267        }
268        // Skip unallocated super blocks entirely
269        if HDF5Reader::is_undef_addr(sblk_addr, size_of_offsets) {
270            continue;
271        }
272
273        collect_super_block_entries(
274            reader,
275            sblk_addr,
276            sblk_idx,
277            base_idx,
278            header,
279            size_of_offsets,
280            size_of_lengths,
281            uncompressed_chunk_size,
282            layout_version,
283            &mut result,
284        )
285        .await?;
286    }
287
288    Ok(result)
289}
290
291/// Read an Extensible Array Super Block (EASB) and collect its data block entries.
292#[allow(clippy::too_many_arguments)]
293async fn collect_super_block_entries(
294    reader: &Arc<dyn AsyncFileReader>,
295    address: u64,
296    sblk_idx: usize,
297    base_idx: u64,
298    header: &ExtensibleArrayHeader,
299    size_of_offsets: u8,
300    size_of_lengths: u8,
301    uncompressed_chunk_size: u64,
302    layout_version: u8,
303    result: &mut Vec<IndexedEntry>,
304) -> Result<()> {
305    let info = &header.sblk_info[sblk_idx];
306
307    // Determine if data blocks at this level are paged
308    let dblk_npages = if info.dblk_nelmts > header.dblk_page_nelmts {
309        info.dblk_nelmts / header.dblk_page_nelmts
310    } else {
311        0
312    };
313    let dblk_page_init_size = if dblk_npages > 0 {
314        dblk_npages.div_ceil(8)
315    } else {
316        0
317    };
318
319    // Compute EASB size
320    let sblock_size = 4 // signature
321        + 1 // version
322        + 1 // client_id
323        + size_of_offsets as usize // header address
324        + header.arr_off_size as usize // block offset
325        + info.ndblks * dblk_page_init_size // page init bitmaps
326        + info.ndblks * size_of_offsets as usize // data block addresses
327        + 4; // checksum
328
329    let data = reader
330        .get_bytes(address..address + sblock_size as u64)
331        .await?;
332    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
333
334    r.read_signature(b"EASB")?;
335    let version = r.read_u8()?;
336    if version != 0 {
337        return Err(HDF5Error::General(format!(
338            "unsupported Extensible Array super block version: {version}"
339        )));
340    }
341    let _client_id = r.read_u8()?;
342    let _header_address = r.read_offset()?;
343    let _block_offset = read_n_byte_uint(&mut r, header.arr_off_size)?;
344
345    // Read page init bitmaps (one per data block)
346    let mut page_init_bitmaps: Vec<Vec<u8>> = Vec::with_capacity(info.ndblks);
347    if dblk_page_init_size > 0 {
348        for _ in 0..info.ndblks {
349            let bitmap = r.read_bytes(dblk_page_init_size)?;
350            page_init_bitmaps.push(bitmap);
351        }
352    }
353
354    // Read data block addresses
355    let mut dblk_addrs = Vec::with_capacity(info.ndblks);
356    for _ in 0..info.ndblks {
357        dblk_addrs.push(r.read_offset()?);
358    }
359
360    // Read each data block
361    for (d, &dblk_addr) in dblk_addrs.iter().enumerate() {
362        let page_bitmap = if !page_init_bitmaps.is_empty() {
363            Some(&page_init_bitmaps[d])
364        } else {
365            None
366        };
367
368        let flat_start = base_idx + info.start_idx + (d as u64) * (info.dblk_nelmts as u64);
369        collect_data_block_entries(
370            reader,
371            dblk_addr,
372            info.dblk_nelmts,
373            flat_start,
374            header,
375            size_of_offsets,
376            size_of_lengths,
377            uncompressed_chunk_size,
378            layout_version,
379            page_bitmap,
380            result,
381        )
382        .await?;
383    }
384
385    Ok(())
386}
387
388/// Read an Extensible Array Data Block (EADB) and collect allocated entries.
389#[allow(clippy::too_many_arguments)]
390async fn collect_data_block_entries(
391    reader: &Arc<dyn AsyncFileReader>,
392    address: u64,
393    nelmts: usize,
394    flat_start: u64,
395    header: &ExtensibleArrayHeader,
396    size_of_offsets: u8,
397    size_of_lengths: u8,
398    uncompressed_chunk_size: u64,
399    layout_version: u8,
400    page_bitmap: Option<&Vec<u8>>,
401    result: &mut Vec<IndexedEntry>,
402) -> Result<()> {
403    if HDF5Reader::is_undef_addr(address, size_of_offsets) {
404        return Ok(()); // Skip unallocated data blocks
405    }
406
407    let is_filtered = header.client_id == 1;
408    let is_paged = nelmts > header.dblk_page_nelmts;
409
410    // EADB prefix size
411    let prefix_size = 4 // signature
412        + 1 // version
413        + 1 // client_id
414        + size_of_offsets as usize // header address
415        + header.arr_off_size as usize; // block offset
416
417    if !is_paged {
418        // Non-paged: elements are inline after prefix
419        let total_size = prefix_size + nelmts * header.element_size as usize + 4; // +4 checksum
420        let data = reader
421            .get_bytes(address..address + total_size as u64)
422            .await?;
423        let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
424
425        r.read_signature(b"EADB")?;
426        let version = r.read_u8()?;
427        if version != 0 {
428            return Err(HDF5Error::General(format!(
429                "unsupported Extensible Array data block version: {version}"
430            )));
431        }
432        let _client_id = r.read_u8()?;
433        let _header_address = r.read_offset()?;
434        let _block_offset = read_n_byte_uint(&mut r, header.arr_off_size)?;
435
436        let entries = parse_entries(
437            &mut r,
438            nelmts,
439            is_filtered,
440            size_of_offsets,
441            uncompressed_chunk_size,
442            layout_version,
443        )?;
444        for (i, entry) in entries.into_iter().enumerate() {
445            if !HDF5Reader::is_undef_addr(entry.address, size_of_offsets) {
446                result.push(IndexedEntry {
447                    flat_idx: flat_start + i as u64,
448                    entry,
449                });
450            }
451        }
452    } else {
453        // Paged: read prefix only (no inline elements), then pages follow
454        let prefix_total = prefix_size + 4; // +4 checksum
455        let data = reader
456            .get_bytes(address..address + prefix_total as u64)
457            .await?;
458        let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
459
460        r.read_signature(b"EADB")?;
461        let version = r.read_u8()?;
462        if version != 0 {
463            return Err(HDF5Error::General(format!(
464                "unsupported Extensible Array data block version: {version}"
465            )));
466        }
467        let _client_id = r.read_u8()?;
468        let _header_address = r.read_offset()?;
469        let _block_offset = read_n_byte_uint(&mut r, header.arr_off_size)?;
470
471        // Pages follow after the data block (prefix + checksum)
472        let npages = nelmts / header.dblk_page_nelmts;
473        let page_size = header.dblk_page_nelmts * header.element_size as usize + 4; // +4 checksum per page
474        let pages_start = address + prefix_total as u64;
475
476        for page_idx in 0..npages {
477            // Check page init bitmap if available
478            let page_initialized = match page_bitmap {
479                Some(bitmap) => {
480                    let byte_idx = page_idx / 8;
481                    let bit_idx = page_idx % 8;
482                    byte_idx < bitmap.len() && (bitmap[byte_idx] >> bit_idx) & 1 != 0
483                }
484                None => true,
485            };
486
487            if !page_initialized {
488                continue; // Skip uninitialized pages
489            }
490
491            let page_addr = pages_start + (page_idx as u64) * (page_size as u64);
492            let page_data = reader
493                .get_bytes(page_addr..page_addr + page_size as u64)
494                .await?;
495            let mut pr = HDF5Reader::with_sizes(page_data, size_of_offsets, size_of_lengths);
496
497            let page_entries = parse_entries(
498                &mut pr,
499                header.dblk_page_nelmts,
500                is_filtered,
501                size_of_offsets,
502                uncompressed_chunk_size,
503                layout_version,
504            )?;
505            let page_flat_start = flat_start + (page_idx * header.dblk_page_nelmts) as u64;
506            for (i, entry) in page_entries.into_iter().enumerate() {
507                if !HDF5Reader::is_undef_addr(entry.address, size_of_offsets) {
508                    result.push(IndexedEntry {
509                        flat_idx: page_flat_start + i as u64,
510                        entry,
511                    });
512                }
513            }
514        }
515    }
516
517    Ok(())
518}
519
/// Return the base-2 logarithm of `n`, which is expected to be a power of 2.
///
/// The precondition is checked with `debug_assert!`, so a violation panics
/// only in builds with debug assertions enabled; otherwise the number of
/// trailing zero bits of `n` is returned unchecked.
fn log2_of_power_of_2(n: u32) -> u32 {
    // `is_power_of_two` is already false for zero, so it also covers `n > 0`.
    debug_assert!(n.is_power_of_two(), "{n} is not a power of 2");
    u32::trailing_zeros(n)
}