// hdf5_reader/extensible_array.rs
1//! HDF5 Extensible Array (EA) chunk index.
2//!
3//! This is the default chunk index for datasets with one unlimited dimension
4//! and `libver='latest'`. It uses a three-level hierarchy:
5//!
6//! - `EAHD` — Extensible Array Header
7//! - `EAIB` — Extensible Array Index Block
8//! - `EADB` — Extensible Array Data Block
9//! - `EASB` — Extensible Array Secondary Block
10
11use crate::checksum::jenkins_lookup3;
12use crate::chunk_index::ChunkEntry;
13use crate::error::{Error, Result};
14use crate::io::Cursor;
15
// Four-byte magic signatures identifying each Extensible Array block type.
const EAHD_SIGNATURE: [u8; 4] = *b"EAHD"; // Extensible Array Header
const EAIB_SIGNATURE: [u8; 4] = *b"EAIB"; // Index Block
const EADB_SIGNATURE: [u8; 4] = *b"EADB"; // Data Block
const EASB_SIGNATURE: [u8; 4] = *b"EASB"; // Secondary Block
20
/// Parsed Extensible Array Header.
#[derive(Debug)]
struct EaHeader {
    // Client id: 1 means chunk entries are filtered (carry size + filter mask);
    // any other value is treated as unfiltered downstream.
    client_id: u8,
    // On-disk size of one array element (entry), in bytes.
    element_size: u8,
    // Bits needed for the maximum element count; determines the width of the
    // block_off field in data/secondary blocks (ceil(bits / 8) bytes).
    _max_nelmts_bits: u8,
    // Number of elements stored inline in the index block.
    idx_blk_elmts: u8,
    // Minimum number of elements per data block.
    data_blk_min_elmts: u8,
    // Minimum number of data block pointers per secondary block.
    sec_blk_min_data_ptrs: u8,
    // log2 of max elements per data block page; 0 disables paging.
    max_dblk_page_nelmts_bits: u8,
    // Element count from the header's statistics section.
    _nelmts: u64,
    // File address of the index block (may be the undefined address).
    index_block_address: u64,
}
34
35/// Parse the Extensible Array Header.
36///
37/// On-disk layout (from H5EA_HEADER_SIZE):
38/// sig(4) + ver(1) + client_id(1) +
39/// element_size(1) + max_nelmts_bits(1) + idx_blk_elmts(1) +
40/// data_blk_min_elmts(1) + sec_blk_min_data_ptrs(1) + max_dblk_page_nelmts_bits(1) +
41/// 6 statistics fields (each length_size) +
42/// index_block_address(offset_size) + checksum(4)
43fn parse_header(data: &[u8], address: u64, offset_size: u8, length_size: u8) -> Result<EaHeader> {
44    let mut cursor = Cursor::new(data);
45    cursor.set_position(address);
46
47    let sig = cursor.read_bytes(4)?;
48    if sig != EAHD_SIGNATURE {
49        return Err(Error::InvalidExtensibleArraySignature {
50            context: "header signature mismatch",
51        });
52    }
53
54    let version = cursor.read_u8()?;
55    if version != 0 {
56        return Err(Error::Other(format!(
57            "unsupported extensible array header version {}",
58            version
59        )));
60    }
61
62    let client_id = cursor.read_u8()?;
63    let element_size = cursor.read_u8()?;
64    let max_nelmts_bits = cursor.read_u8()?;
65    let idx_blk_elmts = cursor.read_u8()?;
66    let data_blk_min_elmts = cursor.read_u8()?;
67    let sec_blk_min_data_ptrs = cursor.read_u8()?;
68    let max_dblk_page_nelmts_bits = cursor.read_u8()?;
69
70    // Statistics (6 fields, each length_size bytes)
71    let _nsuper_blks = cursor.read_length(length_size)?;
72    let _super_blk_size = cursor.read_length(length_size)?;
73    let _ndata_blks = cursor.read_length(length_size)?;
74    let _data_blk_size = cursor.read_length(length_size)?;
75    let _max_idx_set = cursor.read_length(length_size)?;
76    let nelmts = cursor.read_length(length_size)?;
77
78    let index_block_address = cursor.read_offset(offset_size)?;
79
80    // Checksum
81    let header_end = cursor.position();
82    let header_bytes = &data[address as usize..header_end as usize];
83    let stored_checksum = cursor.read_u32_le()?;
84    let computed = jenkins_lookup3(header_bytes);
85    if stored_checksum != computed {
86        return Err(Error::ChecksumMismatch {
87            expected: stored_checksum,
88            actual: computed,
89        });
90    }
91
92    Ok(EaHeader {
93        client_id,
94        element_size,
95        _max_nelmts_bits: max_nelmts_bits,
96        idx_blk_elmts,
97        data_blk_min_elmts,
98        sec_blk_min_data_ptrs,
99        max_dblk_page_nelmts_bits,
100        _nelmts: nelmts,
101        index_block_address,
102    })
103}
104
105/// Compute the super block layout.
106///
107/// Returns a vec of (elements_per_data_block, num_data_blocks) for each super block.
108/// Stops generating entries once cumulative capacity exceeds `nelmts`.
109fn compute_super_block_layout(header: &EaHeader) -> Vec<(u64, u64)> {
110    let mut layout = Vec::new();
111    let dblk_min = header.data_blk_min_elmts as u64;
112    let sblk_min = header.sec_blk_min_data_ptrs as u64;
113    let nelmts = header._nelmts;
114    let mut cumulative = header.idx_blk_elmts as u64;
115
116    for sb_idx in 0u32..64 {
117        if cumulative >= nelmts {
118            break;
119        }
120        let elmts_per_dblk = dblk_min * (1u64 << (sb_idx / 2));
121        let num_dblks = sblk_min * (1u64 << (sb_idx.div_ceil(2)));
122        layout.push((elmts_per_dblk, num_dblks));
123        cumulative += elmts_per_dblk * num_dblks;
124    }
125
126    layout
127}
128
/// A single raw entry.
struct EaRawEntry {
    // Chunk address in the file; u64::MAX (the undefined address) marks an
    // unallocated chunk placeholder.
    address: u64,
    // On-disk chunk size; 0 when the dataset is unfiltered.
    chunk_size: u64,
    // Filter mask; 0 when the dataset is unfiltered.
    filter_mask: u32,
}
135
136/// Read `count` entries from the cursor.
137fn read_entries(
138    cursor: &mut Cursor<'_>,
139    count: usize,
140    is_filtered: bool,
141    offset_size: u8,
142    entry_size: u8,
143) -> Result<Vec<EaRawEntry>> {
144    let mut entries = Vec::with_capacity(count);
145    for _ in 0..count {
146        let address = cursor.read_offset(offset_size)?;
147        let (chunk_size, filter_mask) = if is_filtered {
148            let chunk_size_len = entry_size
149                .checked_sub(offset_size)
150                .and_then(|remaining| remaining.checked_sub(4))
151                .ok_or_else(|| Error::InvalidData("invalid extensible array entry size".into()))?;
152            let cs = cursor.read_length(chunk_size_len)?;
153            let fm = cursor.read_u32_le()?;
154            (cs, fm)
155        } else {
156            (0, 0)
157        };
158        entries.push(EaRawEntry {
159            address,
160            chunk_size,
161            filter_mask,
162        });
163    }
164    Ok(entries)
165}
166
/// Parse a data block and return its entries.
///
/// `sizeof_nelmts` is `ceil(max_nelmts_bits / 8)` — used for the block_off field.
///
/// Layout: sig(4) + ver(1) + client_id(1) + header_address(offset_size) +
/// block_off(sizeof_nelmts) + [page bitmap + paged pages | flat entries] +
/// checksum(4). Uninitialized pages produce placeholder entries with
/// `address == u64::MAX` so the caller's linear indexing stays aligned.
#[allow(clippy::too_many_arguments)]
fn parse_data_block(
    data: &[u8],
    address: u64,
    num_entries: usize,
    is_filtered: bool,
    max_page_bits: u8,
    offset_size: u8,
    entry_size: u8,
    sizeof_nelmts: usize,
) -> Result<Vec<EaRawEntry>> {
    let mut cursor = Cursor::new(data);
    cursor.set_position(address);

    let sig = cursor.read_bytes(4)?;
    if sig != EADB_SIGNATURE {
        return Err(Error::InvalidExtensibleArraySignature {
            context: "data block signature mismatch",
        });
    }

    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::Other(format!(
            "unsupported extensible array data block version {}",
            version
        )));
    }

    // Client id and header back-pointer are not needed for chunk lookup.
    let _client_id = cursor.read_u8()?;
    let _header_address = cursor.read_offset(offset_size)?;

    // Block offset: sizeof_nelmts bytes indicating this block's element index offset.
    cursor.skip(sizeof_nelmts)?;

    // Paging is used only when nelmts exceeds 2^page_bits.
    let page_nelmts = if max_page_bits > 0 {
        1usize << max_page_bits
    } else {
        0
    };

    if page_nelmts > 0 && num_entries > page_nelmts {
        // Paged data block: a bitmap (one bit per page, LSB-first within each
        // byte) marks which pages exist on disk.
        let num_pages = num_entries.div_ceil(page_nelmts);
        let bitmap_bytes = num_pages.div_ceil(8);
        let page_bitmap = cursor.read_bytes(bitmap_bytes)?.to_vec();

        let mut all_entries = Vec::with_capacity(num_entries);
        for page_idx in 0..num_pages {
            let byte_idx = page_idx / 8;
            let bit_idx = page_idx % 8;
            let page_initialized =
                byte_idx < page_bitmap.len() && (page_bitmap[byte_idx] & (1 << bit_idx)) != 0;

            // Only the final page may be partially filled.
            let entries_in_page = if page_idx == num_pages - 1 {
                let remainder = num_entries % page_nelmts;
                if remainder == 0 {
                    page_nelmts
                } else {
                    remainder
                }
            } else {
                page_nelmts
            };

            if page_initialized {
                // Initialized pages are stored back-to-back, each followed by
                // its own 4-byte checksum (consumed here, not verified).
                let page_entries = read_entries(
                    &mut cursor,
                    entries_in_page,
                    is_filtered,
                    offset_size,
                    entry_size,
                )?;
                let _page_checksum = cursor.read_u32_le()?;
                all_entries.extend(page_entries);
            } else {
                // Uninitialized page: no bytes consumed; emit placeholders so
                // later pages keep their linear positions.
                for _ in 0..entries_in_page {
                    all_entries.push(EaRawEntry {
                        address: u64::MAX,
                        chunk_size: 0,
                        filter_mask: 0,
                    });
                }
            }
        }
        Ok(all_entries)
    } else {
        // Non-paged data block: flat entry array followed by one checksum
        // (consumed, not verified).
        let entries = read_entries(
            &mut cursor,
            num_entries,
            is_filtered,
            offset_size,
            entry_size,
        )?;
        let _checksum = cursor.read_u32_le()?;
        Ok(entries)
    }
}
270
271/// Parse a secondary block and return its data block addresses.
272fn parse_secondary_block(
273    data: &[u8],
274    address: u64,
275    num_dblk_addrs: usize,
276    offset_size: u8,
277    sizeof_nelmts: usize,
278    page_bitmap_bytes: usize,
279) -> Result<Vec<u64>> {
280    let mut cursor = Cursor::new(data);
281    cursor.set_position(address);
282
283    let sig = cursor.read_bytes(4)?;
284    if sig != EASB_SIGNATURE {
285        return Err(Error::InvalidExtensibleArraySignature {
286            context: "secondary block signature mismatch",
287        });
288    }
289
290    let version = cursor.read_u8()?;
291    if version != 0 {
292        return Err(Error::Other(format!(
293            "unsupported extensible array secondary block version {}",
294            version
295        )));
296    }
297
298    let _client_id = cursor.read_u8()?;
299    let _header_address = cursor.read_offset(offset_size)?;
300    cursor.skip(sizeof_nelmts)?;
301
302    if page_bitmap_bytes > 0 {
303        cursor.skip(page_bitmap_bytes)?;
304    }
305
306    let mut addrs = Vec::with_capacity(num_dblk_addrs);
307    for _ in 0..num_dblk_addrs {
308        addrs.push(cursor.read_offset(offset_size)?);
309    }
310
311    // Skip checksum
312    let _checksum = cursor.read_u32_le()?;
313
314    Ok(addrs)
315}
316
317fn read_entry_at(
318    data: &[u8],
319    position: u64,
320    is_filtered: bool,
321    offset_size: u8,
322    entry_size: u8,
323) -> Result<EaRawEntry> {
324    let mut cursor = Cursor::new(data);
325    cursor.set_position(position);
326    let mut entries = read_entries(&mut cursor, 1, is_filtered, offset_size, entry_size)?;
327    entries
328        .pop()
329        .ok_or_else(|| Error::InvalidData("missing extensible array entry".into()))
330}
331
/// Map chunk grid coordinates to linear Extensible Array entry indices.
///
/// Returns `(linear_index, element_offsets)` pairs in row-major order for
/// every chunk inside `chunk_bounds` (inclusive on both ends), or for the
/// entire chunk grid when `chunk_bounds` is `None`.
///
/// * `dataset_shape` — current dataset extent per dimension.
/// * `chunk_dims` — chunk size per dimension (same length as the shape).
/// * `chunk_bounds` — optional `(first_chunk, last_chunk)` grid coordinates.
///
/// A scalar dataset (`ndim == 0`) maps to the single linear index 0. If the
/// dataset is empty in any dimension, or the bounds are inverted
/// (`first > last` somewhere), there are no chunks and an empty vec is
/// returned (previously the odometer loop emitted one spurious target in
/// these degenerate cases).
fn linear_target_offsets(
    dataset_shape: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
) -> Vec<(usize, Vec<u64>)> {
    let ndim = dataset_shape.len();
    if ndim == 0 {
        return vec![(0, Vec::new())];
    }

    let chunks_per_dim: Vec<u64> = (0..ndim)
        .map(|i| dataset_shape[i].div_ceil(chunk_dims[i] as u64))
        .collect();

    // An empty dataset has no chunks at all.
    if chunks_per_dim.iter().any(|&count| count == 0) {
        return Vec::new();
    }

    let (first_chunk, last_chunk): (Vec<u64>, Vec<u64>) = match chunk_bounds {
        Some((first, last)) => (first.to_vec(), last.to_vec()),
        // Every count is >= 1 here, so `count - 1` cannot underflow.
        None => (
            vec![0u64; ndim],
            chunks_per_dim.iter().map(|count| count - 1).collect(),
        ),
    };

    // Inverted bounds select nothing; bail out before the loop below would
    // otherwise emit the first coordinate once.
    if first_chunk
        .iter()
        .zip(last_chunk.iter())
        .any(|(first, last)| first > last)
    {
        return Vec::new();
    }

    let mut targets = Vec::new();
    let mut chunk_indices = first_chunk.clone();
    loop {
        // Row-major linearization of the current chunk grid coordinate.
        let mut linear_idx = 0u64;
        for (dim, chunk_index) in chunk_indices.iter().enumerate() {
            linear_idx = linear_idx * chunks_per_dim[dim] + chunk_index;
        }
        let offsets = chunk_indices
            .iter()
            .enumerate()
            .map(|(dim, chunk_index)| chunk_index * u64::from(chunk_dims[dim]))
            .collect();
        targets.push((linear_idx as usize, offsets));

        // Odometer increment: bump the innermost dimension with room left,
        // resetting every dimension after it back to its first chunk.
        // (The reset slice is empty for the last dimension, which is fine.)
        let mut advanced = false;
        for dim in (0..ndim).rev() {
            if chunk_indices[dim] < last_chunk[dim] {
                chunk_indices[dim] += 1;
                chunk_indices[(dim + 1)..ndim].copy_from_slice(&first_chunk[(dim + 1)..ndim]);
                advanced = true;
                break;
            }
        }
        if !advanced {
            break;
        }
    }

    targets
}
390
/// Read a single entry (`local_idx`) from a data block without materializing
/// the whole block.
///
/// Mirrors `parse_data_block`'s layout handling: for paged blocks it walks
/// the page bitmap to compute the target page's byte offset (uninitialized
/// pages occupy no bytes on disk), then reads just the one entry. An
/// uninitialized target page yields a placeholder with `address == u64::MAX`.
#[allow(clippy::too_many_arguments)]
fn read_data_block_entry(
    data: &[u8],
    address: u64,
    num_entries: usize,
    local_idx: usize,
    is_filtered: bool,
    max_page_bits: u8,
    offset_size: u8,
    entry_size: u8,
    sizeof_nelmts: usize,
) -> Result<EaRawEntry> {
    let mut cursor = Cursor::new(data);
    cursor.set_position(address);

    let sig = cursor.read_bytes(4)?;
    if sig != EADB_SIGNATURE {
        return Err(Error::InvalidExtensibleArraySignature {
            context: "data block signature mismatch",
        });
    }

    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::Other(format!(
            "unsupported extensible array data block version {}",
            version
        )));
    }

    // Client id and header back-pointer are not needed for the lookup;
    // block_off (sizeof_nelmts bytes) is skipped too.
    let _client_id = cursor.read_u8()?;
    let _header_address = cursor.read_offset(offset_size)?;
    cursor.skip(sizeof_nelmts)?;

    // Paging applies only when the block holds more entries than one page.
    let page_nelmts = if max_page_bits > 0 {
        1usize << max_page_bits
    } else {
        0
    };

    if page_nelmts > 0 && num_entries > page_nelmts {
        let num_pages = num_entries.div_ceil(page_nelmts);
        let bitmap_bytes = num_pages.div_ceil(8);
        let page_bitmap = cursor.read_bytes(bitmap_bytes)?.to_vec();
        // First byte after the bitmap: where page payloads begin.
        let data_start = cursor.position();

        let target_page = local_idx / page_nelmts;
        let within_page = local_idx % page_nelmts;
        let byte_idx = target_page / 8;
        let bit_idx = target_page % 8;
        let page_initialized =
            byte_idx < page_bitmap.len() && (page_bitmap[byte_idx] & (1 << bit_idx)) != 0;
        if !page_initialized {
            // Page never written: report an undefined chunk address.
            return Ok(EaRawEntry {
                address: u64::MAX,
                chunk_size: 0,
                filter_mask: 0,
            });
        }

        // Accumulate the on-disk sizes of all earlier *initialized* pages
        // (entries plus a 4-byte page checksum each); uninitialized pages
        // take no space and are skipped.
        let mut page_start = data_start;
        for page_idx in 0..target_page {
            // Only the final page may be partially filled.
            let entries_in_page = if page_idx == num_pages - 1 {
                let remainder = num_entries % page_nelmts;
                if remainder == 0 {
                    page_nelmts
                } else {
                    remainder
                }
            } else {
                page_nelmts
            };
            let page_byte_idx = page_idx / 8;
            let page_bit_idx = page_idx % 8;
            let initialized = page_byte_idx < page_bitmap.len()
                && (page_bitmap[page_byte_idx] & (1 << page_bit_idx)) != 0;
            if initialized {
                page_start += (entries_in_page * entry_size as usize + 4) as u64;
            }
        }

        let position = page_start + (within_page * entry_size as usize) as u64;
        return read_entry_at(data, position, is_filtered, offset_size, entry_size);
    }

    // Non-paged block: entries are a flat array right after the header.
    let position = cursor.position() + (local_idx * entry_size as usize) as u64;
    read_entry_at(data, position, is_filtered, offset_size, entry_size)
}
479
/// Collect chunk entries for a bounded chunk range without scanning every
/// data block.
///
/// Parses the index block once, then resolves each target linear index
/// directly to its super block / data block / local slot, reading only the
/// blocks the requested `chunk_bounds` touch. Secondary blocks are parsed
/// lazily and cached per super block.
#[allow(clippy::too_many_arguments)]
fn collect_extensible_array_chunk_entries_bounded(
    data: &[u8],
    header: &EaHeader,
    offset_size: u8,
    dataset_shape: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: (&[u64], &[u64]),
    sb_layout: &[(u64, u64)],
    sizeof_nelmts: usize,
) -> Result<Vec<ChunkEntry>> {
    // Client id 1 means entries carry chunk size + filter mask.
    let is_filtered = header.client_id == 1;
    let targets = linear_target_offsets(dataset_shape, chunk_dims, Some(chunk_bounds));

    let mut cursor = Cursor::new(data);
    cursor.set_position(header.index_block_address);

    let sig = cursor.read_bytes(4)?;
    if sig != EAIB_SIGNATURE {
        return Err(Error::InvalidExtensibleArraySignature {
            context: "index block signature mismatch",
        });
    }

    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::Other(format!(
            "unsupported extensible array index block version {}",
            version
        )));
    }

    let _client_id = cursor.read_u8()?;
    let _header_address = cursor.read_offset(offset_size)?;

    // Inline elements live directly in the index block; remember where they
    // start so single entries can be addressed later, then skip past them.
    let num_inline = header.idx_blk_elmts as usize;
    let inline_start = cursor.position();
    cursor.skip(num_inline * header.element_size as usize)?;

    // Direct data block addresses (serving the first super blocks).
    let ndblk_addrs = 2 * header.sec_blk_min_data_ptrs as usize;
    let mut direct_dblk_addrs = Vec::with_capacity(ndblk_addrs);
    for _ in 0..ndblk_addrs {
        direct_dblk_addrs.push(cursor.read_offset(offset_size)?);
    }

    // Secondary block addresses for the remaining super blocks.
    let nsblks = sb_layout.len();
    let nsblk_addrs = nsblks.saturating_sub(ndblk_addrs);
    let mut sec_block_addrs = Vec::with_capacity(nsblk_addrs);
    for _ in 0..nsblk_addrs {
        sec_block_addrs.push(cursor.read_offset(offset_size)?);
    }

    // Lazily parsed secondary blocks, keyed by (super block index - 2).
    let mut secondary_block_cache: Vec<Option<Vec<u64>>> = vec![None; sec_block_addrs.len()];
    let mut entries = Vec::new();

    for (linear_idx, offsets) in targets {
        let raw = if linear_idx < num_inline {
            // Target lives in the index block's inline element array.
            read_entry_at(
                data,
                inline_start + (linear_idx * header.element_size as usize) as u64,
                is_filtered,
                offset_size,
                header.element_size,
            )?
        } else {
            // Locate the super block containing this (post-inline) index by
            // subtracting each super block's capacity in turn.
            let mut relative_idx = (linear_idx - num_inline) as u64;
            let mut sb_idx = None;
            for (candidate_idx, (elmts_per_dblk, num_dblks)) in sb_layout.iter().enumerate() {
                let capacity = elmts_per_dblk * num_dblks;
                if relative_idx < capacity {
                    sb_idx = Some(candidate_idx);
                    break;
                }
                relative_idx -= capacity;
            }

            // Beyond the layout: the chunk was never allocated — skip it.
            let Some(sb_idx) = sb_idx else {
                continue;
            };
            let (elmts_per_dblk, _) = sb_layout[sb_idx];
            let dblk_idx = (relative_idx / elmts_per_dblk) as usize;
            let local_idx = (relative_idx % elmts_per_dblk) as usize;

            let dblk_addr = if sb_idx < 2 {
                // Super blocks 0-1: data block addresses are stored directly
                // in the index block; `base` skips super block 0's slots.
                let base = sb_layout[..sb_idx]
                    .iter()
                    .map(|(_, num_dblks)| *num_dblks as usize)
                    .sum::<usize>();
                *direct_dblk_addrs.get(base + dblk_idx).unwrap_or(&u64::MAX)
            } else {
                // Super blocks 2+: resolve through a secondary block, parsing
                // and caching it on first touch.
                let sec_cache_idx = sb_idx - 2;
                if secondary_block_cache[sec_cache_idx].is_none() {
                    let sec_addr = sec_block_addrs
                        .get(sec_cache_idx)
                        .copied()
                        .unwrap_or(u64::MAX);
                    if Cursor::is_undefined_offset(sec_addr, offset_size) {
                        // Missing secondary block: cache an empty address
                        // list so every lookup resolves to undefined below.
                        secondary_block_cache[sec_cache_idx] = Some(Vec::new());
                    } else {
                        let (_, num_dblks) = sb_layout[sb_idx];
                        // Paged data blocks add a page bitmap to the secondary
                        // block: ceil(num_dblks * pages_per_dblk / 8) bytes.
                        let page_bitmap_bytes = if header.max_dblk_page_nelmts_bits > 0
                            && elmts_per_dblk > (1u64 << header.max_dblk_page_nelmts_bits)
                        {
                            let page_nelmts = 1usize << header.max_dblk_page_nelmts_bits;
                            let pages_per_dblk = (elmts_per_dblk as usize).div_ceil(page_nelmts);
                            (num_dblks as usize * pages_per_dblk).div_ceil(8)
                        } else {
                            0
                        };
                        secondary_block_cache[sec_cache_idx] = Some(parse_secondary_block(
                            data,
                            sec_addr,
                            num_dblks as usize,
                            offset_size,
                            sizeof_nelmts,
                            page_bitmap_bytes,
                        )?);
                    }
                }

                secondary_block_cache[sec_cache_idx]
                    .as_ref()
                    .and_then(|addrs| addrs.get(dblk_idx))
                    .copied()
                    .unwrap_or(u64::MAX)
            };

            // Undefined data block address: chunk never allocated.
            if Cursor::is_undefined_offset(dblk_addr, offset_size) {
                continue;
            }

            read_data_block_entry(
                data,
                dblk_addr,
                elmts_per_dblk as usize,
                local_idx,
                is_filtered,
                header.max_dblk_page_nelmts_bits,
                offset_size,
                header.element_size,
                sizeof_nelmts,
            )?
        };

        // Placeholder entries (undefined address) are skipped entirely.
        if Cursor::is_undefined_offset(raw.address, offset_size) {
            continue;
        }

        entries.push(ChunkEntry {
            address: raw.address,
            size: raw.chunk_size,
            filter_mask: raw.filter_mask,
            offsets,
        });
    }

    Ok(entries)
}
638
/// Collect chunk entries from an Extensible Array index.
///
/// Walks the EAHD → EAIB → (EADB / EASB → EADB) hierarchy and converts
/// linear entry indices to multi-dimensional chunk offsets.
///
/// * `data` — the full file image.
/// * `header_address` — file address of the EAHD.
/// * `offset_size` / `length_size` — field widths from the file superblock.
/// * `dataset_shape` / `chunk_dims` — used to map linear entry indices to
///   chunk coordinate offsets.
/// * `chunk_bounds` — optional inclusive (first, last) chunk-grid bounds;
///   when present, dispatches to the bounded reader which touches only the
///   overlapping blocks.
pub fn collect_extensible_array_chunk_entries(
    data: &[u8],
    header_address: u64,
    offset_size: u8,
    length_size: u8,
    dataset_shape: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
) -> Result<Vec<ChunkEntry>> {
    let header = parse_header(data, header_address, offset_size, length_size)?;

    // No index block allocated yet → dataset has no chunks.
    if Cursor::is_undefined_offset(header.index_block_address, offset_size) {
        return Ok(Vec::new());
    }

    // Client id 1 means entries carry chunk size + filter mask.
    let is_filtered = header.client_id == 1;
    let sb_layout = compute_super_block_layout(&header);
    // Width of the block_off field inside data/secondary blocks.
    let sizeof_nelmts = (header._max_nelmts_bits as usize).div_ceil(8);

    if let Some(bounds) = chunk_bounds {
        return collect_extensible_array_chunk_entries_bounded(
            data,
            &header,
            offset_size,
            dataset_shape,
            chunk_dims,
            bounds,
            &sb_layout,
            sizeof_nelmts,
        );
    }

    // Parse the index block.
    let mut cursor = Cursor::new(data);
    cursor.set_position(header.index_block_address);

    let sig = cursor.read_bytes(4)?;
    if sig != EAIB_SIGNATURE {
        return Err(Error::InvalidExtensibleArraySignature {
            context: "index block signature mismatch",
        });
    }

    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::Other(format!(
            "unsupported extensible array index block version {}",
            version
        )));
    }

    let _client_id = cursor.read_u8()?;
    let _header_address = cursor.read_offset(offset_size)?;

    // 1. Inline elements (idx_blk_elmts entries stored directly).
    let num_inline = header.idx_blk_elmts as usize;
    let inline_entries = read_entries(
        &mut cursor,
        num_inline,
        is_filtered,
        offset_size,
        header.element_size,
    )?;

    // 2. Data block addresses stored directly in the index block.
    // The number is 2 * sec_blk_min_data_ptrs (from HDF5: EA_IBLOCK_NDBLK_ADDRS).
    let ndblk_addrs = 2 * header.sec_blk_min_data_ptrs as usize;
    let mut direct_dblk_addrs = Vec::with_capacity(ndblk_addrs);
    for _ in 0..ndblk_addrs {
        direct_dblk_addrs.push(cursor.read_offset(offset_size)?);
    }

    // 3. Secondary block addresses for super blocks 2+.
    // nsblk_addrs = max(0, nsblks - ndblk_addrs) where nsblks is the total
    // number of super blocks needed to cover nelmts.
    // compute_super_block_layout already stops once capacity >= nelmts,
    // so sb_layout.len() is the total number of super blocks needed.
    let nsblks = sb_layout.len();

    let nsblk_addrs = nsblks.saturating_sub(ndblk_addrs);
    let mut sec_block_addrs = Vec::with_capacity(nsblk_addrs);
    for _ in 0..nsblk_addrs {
        sec_block_addrs.push(cursor.read_offset(offset_size)?);
    }

    // Skip checksum at end of index block
    let _checksum = cursor.read_u32_le()?;

    // Now collect all entries, in linear order, with undefined-address
    // placeholders filling the gaps so linear indices stay consistent.
    let mut all_entries: Vec<EaRawEntry> = Vec::new();

    // Inline entries
    all_entries.extend(inline_entries);

    // Data blocks from direct addresses (super blocks 0-1)
    let mut dblk_addr_idx = 0;
    for sb_idx_iter in 0..2usize.min(nsblks) {
        if sb_idx_iter >= sb_layout.len() {
            break;
        }
        let (elmts_per_dblk, num_dblks) = sb_layout[sb_idx_iter];
        for _ in 0..num_dblks {
            // NOTE(review): running out of direct address slots breaks out
            // without pushing placeholders, which would shift the linear
            // indices of everything after — confirm layouts where
            // num_dblks(sb0) + num_dblks(sb1) > ndblk_addrs cannot occur.
            if dblk_addr_idx >= direct_dblk_addrs.len() {
                break;
            }
            let dblk_addr = direct_dblk_addrs[dblk_addr_idx];
            dblk_addr_idx += 1;

            if Cursor::is_undefined_offset(dblk_addr, offset_size) {
                // Unallocated data block: keep linear positions with
                // placeholder entries.
                for _ in 0..elmts_per_dblk {
                    all_entries.push(EaRawEntry {
                        address: u64::MAX,
                        chunk_size: 0,
                        filter_mask: 0,
                    });
                }
            } else {
                let dblk_entries = parse_data_block(
                    data,
                    dblk_addr,
                    elmts_per_dblk as usize,
                    is_filtered,
                    header.max_dblk_page_nelmts_bits,
                    offset_size,
                    header.element_size,
                    sizeof_nelmts,
                )?;
                all_entries.extend(dblk_entries);
            }
        }
    }

    // Data blocks from super blocks 2+ (via secondary blocks)
    for (sec_idx, &sec_addr) in sec_block_addrs.iter().enumerate() {
        let sb_idx_iter = sec_idx + 2;
        if sb_idx_iter >= sb_layout.len() {
            break;
        }
        let (elmts_per_dblk, num_dblks) = sb_layout[sb_idx_iter];

        if Cursor::is_undefined_offset(sec_addr, offset_size) {
            // Whole super block unallocated: placeholder for every element.
            for _ in 0..(elmts_per_dblk * num_dblks) {
                all_entries.push(EaRawEntry {
                    address: u64::MAX,
                    chunk_size: 0,
                    filter_mask: 0,
                });
            }
            continue;
        }

        // Per HDF5 spec III.H "Extensible Array Secondary Block", the secondary
        // block contains a page initialization bitmap when data blocks are paged.
        // Bitmap size = ceil(num_dblks * pages_per_dblk / 8).
        let page_bitmap_bytes = if header.max_dblk_page_nelmts_bits > 0
            && elmts_per_dblk > (1u64 << header.max_dblk_page_nelmts_bits)
        {
            let page_nelmts = 1usize << header.max_dblk_page_nelmts_bits;
            let pages_per_dblk = (elmts_per_dblk as usize).div_ceil(page_nelmts);
            (num_dblks as usize * pages_per_dblk).div_ceil(8)
        } else {
            0
        };
        let dblk_addrs = parse_secondary_block(
            data,
            sec_addr,
            num_dblks as usize,
            offset_size,
            sizeof_nelmts,
            page_bitmap_bytes,
        )?;

        for &dblk_addr in &dblk_addrs {
            if Cursor::is_undefined_offset(dblk_addr, offset_size) {
                // Unallocated data block inside the super block.
                for _ in 0..elmts_per_dblk {
                    all_entries.push(EaRawEntry {
                        address: u64::MAX,
                        chunk_size: 0,
                        filter_mask: 0,
                    });
                }
            } else {
                let dblk_entries = parse_data_block(
                    data,
                    dblk_addr,
                    elmts_per_dblk as usize,
                    is_filtered,
                    header.max_dblk_page_nelmts_bits,
                    offset_size,
                    header.element_size,
                    sizeof_nelmts,
                )?;
                all_entries.extend(dblk_entries);
            }
        }
    }

    // Convert linear indices to chunk offsets (row-major decomposition).
    let ndim = dataset_shape.len();
    let chunks_per_dim: Vec<u64> = (0..ndim)
        .map(|i| dataset_shape[i].div_ceil(chunk_dims[i] as u64))
        .collect();

    let mut entries = Vec::new();
    for (linear_idx, raw) in all_entries.iter().enumerate() {
        // Placeholder entries (never-allocated chunks) are dropped here.
        if Cursor::is_undefined_offset(raw.address, offset_size) {
            continue;
        }

        let mut remaining = linear_idx as u64;
        let mut offsets = vec![0u64; ndim];
        for d in (0..ndim).rev() {
            offsets[d] = (remaining % chunks_per_dim[d]) * chunk_dims[d] as u64;
            remaining /= chunks_per_dim[d];
        }

        // NOTE(review): chunk_bounds is always None on this path (the Some
        // case returned via the bounded reader above), so this filter is
        // effectively dead code kept for safety.
        if let Some((first_chunk, last_chunk)) = chunk_bounds {
            let overlaps = offsets.iter().enumerate().all(|(dim, offset)| {
                let chunk_index = *offset / u64::from(chunk_dims[dim]);
                chunk_index >= first_chunk[dim] && chunk_index <= last_chunk[dim]
            });
            if !overlaps {
                continue;
            }
        }

        entries.push(ChunkEntry {
            address: raw.address,
            size: raw.chunk_size,
            filter_mask: raw.filter_mask,
            offsets,
        });
    }

    Ok(entries)
}
879
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_eahd_bad_signature() {
        // A buffer whose first four bytes are not "EAHD" must be rejected
        // before any further parsing happens.
        let mut buf = vec![0u8; 64];
        buf[..4].copy_from_slice(b"XXXX");
        let err = parse_header(&buf, 0, 8, 8).unwrap_err();
        assert!(matches!(err, Error::InvalidExtensibleArraySignature { .. }));
    }

    #[test]
    fn test_compute_super_block_layout() {
        let header = EaHeader {
            client_id: 0,
            element_size: 8,
            _max_nelmts_bits: 32,
            idx_blk_elmts: 2,
            data_blk_min_elmts: 2,
            sec_blk_min_data_ptrs: 2,
            max_dblk_page_nelmts_bits: 0,
            _nelmts: 100,
            index_block_address: 0,
        };
        let layout = compute_super_block_layout(&header);

        // Expected geometry: elements-per-data-block doubles every other
        // super block, alternating with the data-block count:
        // sb 0: (2 * 2^0, 2 * 2^0)  -> capacity  4 elements
        // sb 1: (2 * 2^0, 2 * 2^1)  -> capacity  8 elements
        // sb 2: (2 * 2^1, 2 * 2^1)  -> capacity 16 elements
        // sb 3: (2 * 2^1, 2 * 2^2)  -> capacity 32 elements
        let expected = [(2u64, 2u64), (2, 4), (4, 4), (4, 8)];
        for (sb_idx, want) in expected.iter().enumerate() {
            assert_eq!(layout[sb_idx], *want, "super block {}", sb_idx);
        }
    }
}
915}