// hdf5_reader/fixed_array.rs

//! HDF5 Fixed Array (FA) chunk index.
//!
//! This is the default chunk index for fixed-size chunked datasets created with
//! `libver='latest'`. It stores chunk entries in a flat array (optionally paged)
//! with a single-level header → data block structure.
//!
//! Structures:
//! - `FAHD` — Fixed Array Header
//! - `FADB` — Fixed Array Data Block

use crate::checksum::jenkins_lookup3;
use crate::chunk_index::ChunkEntry;
use crate::error::{Error, Result};
use crate::io::Cursor;
15
// On-disk 4-byte signatures that open the Fixed Array header and data block.
const FAHD_SIGNATURE: [u8; 4] = *b"FAHD";
const FADB_SIGNATURE: [u8; 4] = *b"FADB";
18
/// Parsed Fixed Array Header.
#[derive(Debug)]
struct FaHeader {
    /// Client id: 1 = filtered chunks (entries carry size + filter mask),
    /// otherwise unfiltered.
    client_id: u8,
    /// On-disk size in bytes of one data-block entry.
    entry_size: u8,
    /// log2 of the entries-per-page count; paging is only used when
    /// `num_entries` exceeds one page.
    page_bits: u8,
    /// Total number of entries (chunks) indexed by the array.
    num_entries: u64,
    /// File address of the FADB data block; may be the undefined address
    /// when no chunk was ever allocated.
    data_block_address: u64,
}
28
29/// Parse the Fixed Array Header at the given address.
30///
31/// On-disk layout (from H5FA_HEADER_SIZE):
32/// sig(4) + ver(1) + client_id(1) + entry_size(1) + page_bits(1)
33/// + nelmts(length_size) + dblk_addr(offset_size) + checksum(4)
34fn parse_header(data: &[u8], address: u64, offset_size: u8, length_size: u8) -> Result<FaHeader> {
35    let mut cursor = Cursor::new(data);
36    cursor.set_position(address);
37
38    let sig = cursor.read_bytes(4)?;
39    if sig != FAHD_SIGNATURE {
40        return Err(Error::InvalidFixedArraySignature {
41            context: "header signature mismatch",
42        });
43    }
44
45    let version = cursor.read_u8()?;
46    if version != 0 {
47        return Err(Error::Other(format!(
48            "unsupported fixed array header version {}",
49            version
50        )));
51    }
52
53    let client_id = cursor.read_u8()?;
54    let entry_size = cursor.read_u8()?;
55    let page_bits = cursor.read_u8()?;
56    let num_entries = cursor.read_length(length_size)?;
57    let data_block_address = cursor.read_offset(offset_size)?;
58
59    // Checksum covers everything from signature through data_block_address.
60    let header_end = cursor.position();
61    let header_bytes = &data[address as usize..header_end as usize];
62    let stored_checksum = cursor.read_u32_le()?;
63    let computed = jenkins_lookup3(header_bytes);
64    if stored_checksum != computed {
65        return Err(Error::ChecksumMismatch {
66            expected: stored_checksum,
67            actual: computed,
68        });
69    }
70
71    Ok(FaHeader {
72        client_id,
73        entry_size,
74        page_bits,
75        num_entries,
76        data_block_address,
77    })
78}
79
/// A single raw fixed-array entry (before conversion to ChunkEntry).
#[derive(Debug)]
struct FaRawEntry {
    /// File address of the chunk; the undefined address marks an
    /// unallocated chunk and is filtered out by callers.
    address: u64,
    /// Stored chunk size; left at 0 for unfiltered entries (client id != 1).
    chunk_size: u64,
    /// Filter pipeline mask; left at 0 for unfiltered entries.
    filter_mask: u32,
}
87
/// Read entries from a Fixed Array Data Block.
///
/// Returns one raw entry per linear chunk index, in order. For the paged
/// layout, pages flagged uninitialized in the page bitmap occupy no bytes on
/// disk and are represented here by placeholder entries with an all-ones
/// address so callers can skip them uniformly.
fn parse_data_block(
    data: &[u8],
    address: u64,
    header: &FaHeader,
    offset_size: u8,
) -> Result<Vec<FaRawEntry>> {
    let mut cursor = Cursor::new(data);
    cursor.set_position(address);

    // `address` (the header's data_block_address) must point at "FADB".
    let sig = cursor.read_bytes(4)?;
    if sig != FADB_SIGNATURE {
        return Err(Error::InvalidFixedArraySignature {
            context: "data block signature mismatch",
        });
    }

    // Only version 0 of the data block is defined.
    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::Other(format!(
            "unsupported fixed array data block version {}",
            version
        )));
    }

    // Client id and header back-pointer are skipped: the caller already has
    // the authoritative values from the parsed FAHD.
    let _client_id = cursor.read_u8()?;
    let _header_address = cursor.read_offset(offset_size)?;

    let num_entries = header.num_entries as usize;
    // Client id 1 marks filtered chunks: entries then carry size + mask.
    let is_filtered = header.client_id == 1;

    // Paging is used only when nelmts exceeds 2^page_bits.
    // NOTE(review): `1usize << page_bits` overflows (panics in debug builds)
    // when page_bits >= usize::BITS — assumed bounded by upstream format
    // validation; confirm.
    let use_paging = header.page_bits > 0 && num_entries > (1usize << header.page_bits);

    if !use_paging {
        // Non-paged: all entries inline followed by a single checksum.
        let entries = read_entries(
            &mut cursor,
            num_entries,
            is_filtered,
            offset_size,
            header.entry_size,
        )?;
        // Skip the trailing checksum (already verified structurally).
        // NOTE(review): this checksum is read but never recomputed/compared.
        let _checksum = cursor.read_u32_le()?;
        Ok(entries)
    } else {
        // Paged: entries are split into pages of `2^page_bits` entries each.
        let entries_per_page = 1usize << header.page_bits;
        let num_pages = num_entries.div_ceil(entries_per_page);

        // Page init bitmap: ceil(num_pages / 8) bytes — tells which pages
        // have been initialized. We read all pages regardless (uninitialized
        // pages have undefined addresses that we filter out later).
        let bitmap_bytes = num_pages.div_ceil(8);
        let page_bitmap = cursor.read_bytes(bitmap_bytes)?.to_vec();

        let mut all_entries = Vec::with_capacity(num_entries);

        for page_idx in 0..num_pages {
            // Bit `page_idx`, LSB-first within each byte, flags an
            // initialized page.
            let byte_idx = page_idx / 8;
            let bit_idx = page_idx % 8;
            let page_initialized =
                byte_idx < page_bitmap.len() && (page_bitmap[byte_idx] & (1 << bit_idx)) != 0;

            // The last page may be short when num_entries is not a multiple
            // of the page size.
            let entries_in_this_page = if page_idx == num_pages - 1 {
                let remainder = num_entries % entries_per_page;
                if remainder == 0 {
                    entries_per_page
                } else {
                    remainder
                }
            } else {
                entries_per_page
            };

            if page_initialized {
                let page_entries = read_entries(
                    &mut cursor,
                    entries_in_this_page,
                    is_filtered,
                    offset_size,
                    header.entry_size,
                )?;
                // Each page has its own checksum.
                let _page_checksum = cursor.read_u32_le()?;
                all_entries.extend(page_entries);
            } else {
                // Uninitialized page — fill with undefined entries.
                // Note the cursor does not advance: uninitialized pages take
                // no space on disk (matches the bounded reader's layout math).
                // NOTE(review): uses u64::MAX as the undefined marker; for
                // offset_size < 8, confirm `Cursor::is_undefined_offset`
                // still classifies this value as undefined.
                for _ in 0..entries_in_this_page {
                    all_entries.push(FaRawEntry {
                        address: u64::MAX,
                        chunk_size: 0,
                        filter_mask: 0,
                    });
                }
            }
        }

        Ok(all_entries)
    }
}
190
191/// Read `count` entries from the cursor.
192fn read_entries(
193    cursor: &mut Cursor<'_>,
194    count: usize,
195    is_filtered: bool,
196    offset_size: u8,
197    entry_size: u8,
198) -> Result<Vec<FaRawEntry>> {
199    let mut entries = Vec::with_capacity(count);
200    for _ in 0..count {
201        let address = cursor.read_offset(offset_size)?;
202        let (chunk_size, filter_mask) = if is_filtered {
203            let chunk_size_len = entry_size
204                .checked_sub(offset_size)
205                .and_then(|remaining| remaining.checked_sub(4))
206                .ok_or_else(|| Error::InvalidData("invalid fixed array entry size".into()))?;
207            let cs = cursor.read_length(chunk_size_len)?;
208            let fm = cursor.read_u32_le()?;
209            (cs, fm)
210        } else {
211            (0, 0)
212        };
213        entries.push(FaRawEntry {
214            address,
215            chunk_size,
216            filter_mask,
217        });
218    }
219    Ok(entries)
220}
221
222fn read_entry_at(
223    data: &[u8],
224    position: u64,
225    is_filtered: bool,
226    offset_size: u8,
227    entry_size: u8,
228) -> Result<FaRawEntry> {
229    let mut cursor = Cursor::new(data);
230    cursor.set_position(position);
231    let mut entries = read_entries(&mut cursor, 1, is_filtered, offset_size, entry_size)?;
232    entries
233        .pop()
234        .ok_or_else(|| Error::InvalidData("missing fixed array entry".into()))
235}
236
/// Enumerate `(linear index, element offsets)` for every chunk whose chunk
/// coordinates lie in the inclusive box given by `chunk_bounds`, in row-major
/// order (last dimension varies fastest).
///
/// With `chunk_bounds == None`, the full chunk grid implied by
/// `dataset_shape` / `chunk_dims` is enumerated. A zero-dimensional dataset
/// yields the single chunk at linear index 0.
fn linear_target_offsets(
    dataset_shape: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
) -> Vec<(usize, Vec<u64>)> {
    let ndim = dataset_shape.len();
    // Chunk-grid extent per dimension (ceiling division).
    let grid: Vec<u64> = (0..ndim)
        .map(|d| dataset_shape[d].div_ceil(u64::from(chunk_dims[d])))
        .collect();

    if ndim == 0 {
        return vec![(0, Vec::new())];
    }

    // Inclusive lower/upper chunk coordinates of the box to walk.
    let (lo, hi): (Vec<u64>, Vec<u64>) = match chunk_bounds {
        Some((first, last)) => (first.to_vec(), last.to_vec()),
        None => (
            vec![0u64; ndim],
            grid.iter().map(|count| count.saturating_sub(1)).collect(),
        ),
    };

    let mut out = Vec::new();
    let mut coords = lo.clone();
    loop {
        // Row-major linearization of the current chunk coordinate.
        let linear = coords
            .iter()
            .enumerate()
            .fold(0u64, |acc, (dim, &c)| acc * grid[dim] + c);
        // Element-space offset of the chunk's first element per dimension.
        let offsets: Vec<u64> = coords
            .iter()
            .enumerate()
            .map(|(dim, &c)| c * u64::from(chunk_dims[dim]))
            .collect();
        out.push((linear as usize, offsets));

        // Odometer increment: bump the lowest-order dimension that still has
        // room, resetting everything after it; exhaustion ends the walk.
        let mut dim = ndim;
        loop {
            if dim == 0 {
                return out;
            }
            dim -= 1;
            if coords[dim] < hi[dim] {
                coords[dim] += 1;
                coords[dim + 1..].copy_from_slice(&lo[dim + 1..]);
                break;
            }
        }
    }
}
295
/// Targeted-read variant of the Fixed Array lookup: decode only the entries
/// whose chunk coordinates fall inside `chunk_bounds` (inclusive), seeking
/// directly to each entry instead of scanning the whole data block.
///
/// Unlike `parse_data_block`, this path does not read per-block/per-page
/// checksums as part of entry decoding — entries are fetched positionally
/// via `read_entry_at`.
fn collect_fixed_array_chunk_entries_bounded(
    data: &[u8],
    header: &FaHeader,
    offset_size: u8,
    dataset_shape: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: (&[u64], &[u64]),
) -> Result<Vec<ChunkEntry>> {
    // Linear indices + element offsets of every chunk inside the bounds.
    let targets = linear_target_offsets(dataset_shape, chunk_dims, Some(chunk_bounds));
    let mut cursor = Cursor::new(data);
    cursor.set_position(header.data_block_address);

    // Validate the FADB prefix exactly as the full-scan path does.
    let sig = cursor.read_bytes(4)?;
    if sig != FADB_SIGNATURE {
        return Err(Error::InvalidFixedArraySignature {
            context: "data block signature mismatch",
        });
    }

    let version = cursor.read_u8()?;
    if version != 0 {
        return Err(Error::Other(format!(
            "unsupported fixed array data block version {}",
            version
        )));
    }

    // Skipped: caller already has the authoritative FAHD values.
    let _client_id = cursor.read_u8()?;
    let _header_address = cursor.read_offset(offset_size)?;

    let num_entries = header.num_entries as usize;
    let is_filtered = header.client_id == 1;
    let entry_bytes = header.entry_size as usize;
    // Same paging criterion as parse_data_block.
    let use_paging = header.page_bits > 0 && num_entries > (1usize << header.page_bits);

    if !use_paging {
        // Non-paged: entries start right after the prefix; seek per target.
        let entries_start = cursor.position();
        let mut entries = Vec::new();
        for (linear_idx, offsets) in targets {
            let position = entries_start + (linear_idx * entry_bytes) as u64;
            let raw = read_entry_at(data, position, is_filtered, offset_size, header.entry_size)?;
            // Undefined address = chunk never allocated; skip.
            if Cursor::is_undefined_offset(raw.address, offset_size) {
                continue;
            }
            entries.push(ChunkEntry {
                address: raw.address,
                size: raw.chunk_size,
                filter_mask: raw.filter_mask,
                offsets,
            });
        }
        return Ok(entries);
    }

    // Paged layout: page-init bitmap, then each *initialized* page stored
    // back-to-back as `entries + 4-byte checksum`; uninitialized pages take
    // no space on disk.
    let entries_per_page = 1usize << header.page_bits;
    let num_pages = num_entries.div_ceil(entries_per_page);
    let bitmap_bytes = num_pages.div_ceil(8);
    let page_bitmap = cursor.read_bytes(bitmap_bytes)?.to_vec();
    let pages_start = cursor.position();

    // First pass: compute each initialized page's absolute start offset by
    // accumulating the sizes of the initialized pages before it.
    let mut page_offsets = vec![None; num_pages];
    let mut next_page_start = pages_start;
    for (page_idx, page_offset) in page_offsets.iter_mut().enumerate().take(num_pages) {
        let byte_idx = page_idx / 8;
        let bit_idx = page_idx % 8;
        let page_initialized =
            byte_idx < page_bitmap.len() && (page_bitmap[byte_idx] & (1 << bit_idx)) != 0;

        // The last page may hold fewer entries.
        let entries_in_page = if page_idx == num_pages - 1 {
            let remainder = num_entries % entries_per_page;
            if remainder == 0 {
                entries_per_page
            } else {
                remainder
            }
        } else {
            entries_per_page
        };

        if page_initialized {
            *page_offset = Some(next_page_start);
            // +4 for the per-page checksum trailing the entries.
            next_page_start += (entries_in_page * entry_bytes + 4) as u64;
        }
    }

    // Second pass: seek to each requested entry within its page.
    let mut entries = Vec::new();
    for (linear_idx, offsets) in targets {
        let page_idx = linear_idx / entries_per_page;
        let within_page = linear_idx % entries_per_page;
        // NOTE(review): indexes `page_offsets` directly — assumes every
        // target linear index is < num_entries (i.e. chunk_bounds lie inside
        // the dataset); confirm callers guarantee this, else this can panic.
        let Some(page_start) = page_offsets[page_idx] else {
            // Page never initialized → none of its chunks are allocated.
            continue;
        };
        let position = page_start + (within_page * entry_bytes) as u64;
        let raw = read_entry_at(data, position, is_filtered, offset_size, header.entry_size)?;
        if Cursor::is_undefined_offset(raw.address, offset_size) {
            continue;
        }
        entries.push(ChunkEntry {
            address: raw.address,
            size: raw.chunk_size,
            filter_mask: raw.filter_mask,
            offsets,
        });
    }

    Ok(entries)
}
403
404/// Collect chunk entries from a Fixed Array index.
405///
406/// Reads the FAHD header and FADB data block, then converts linear entry
407/// indices to multi-dimensional chunk offsets.
408pub fn collect_fixed_array_chunk_entries(
409    data: &[u8],
410    header_address: u64,
411    offset_size: u8,
412    length_size: u8,
413    dataset_shape: &[u64],
414    chunk_dims: &[u32],
415    chunk_bounds: Option<(&[u64], &[u64])>,
416) -> Result<Vec<ChunkEntry>> {
417    let header = parse_header(data, header_address, offset_size, length_size)?;
418
419    if Cursor::is_undefined_offset(header.data_block_address, offset_size) {
420        return Ok(Vec::new());
421    }
422
423    if let Some(bounds) = chunk_bounds {
424        return collect_fixed_array_chunk_entries_bounded(
425            data,
426            &header,
427            offset_size,
428            dataset_shape,
429            chunk_dims,
430            bounds,
431        );
432    }
433
434    let raw_entries = parse_data_block(data, header.data_block_address, &header, offset_size)?;
435
436    let ndim = dataset_shape.len();
437    let chunks_per_dim: Vec<u64> = (0..ndim)
438        .map(|i| dataset_shape[i].div_ceil(chunk_dims[i] as u64))
439        .collect();
440
441    let mut entries = Vec::new();
442    for (linear_idx, raw) in raw_entries.iter().enumerate() {
443        // Skip undefined addresses (unallocated chunks).
444        if Cursor::is_undefined_offset(raw.address, offset_size) {
445            continue;
446        }
447
448        // Convert linear index to multi-dimensional chunk offsets.
449        let mut remaining = linear_idx as u64;
450        let mut offsets = vec![0u64; ndim];
451        for d in (0..ndim).rev() {
452            offsets[d] = (remaining % chunks_per_dim[d]) * chunk_dims[d] as u64;
453            remaining /= chunks_per_dim[d];
454        }
455
456        if let Some((first_chunk, last_chunk)) = chunk_bounds {
457            let overlaps = offsets.iter().enumerate().all(|(dim, offset)| {
458                let chunk_index = *offset / u64::from(chunk_dims[dim]);
459                chunk_index >= first_chunk[dim] && chunk_index <= last_chunk[dim]
460            });
461            if !overlaps {
462                continue;
463            }
464        }
465
466        entries.push(ChunkEntry {
467            address: raw.address,
468            size: raw.chunk_size,
469            filter_mask: raw.filter_mask,
470            offsets,
471        });
472    }
473
474    Ok(entries)
475}
476
#[cfg(test)]
mod tests {
    use super::*;

    /// A FAHD parse must fail fast when the 4-byte signature is wrong.
    #[test]
    fn test_fahd_bad_signature() {
        let mut buf = vec![0u8; 64];
        buf[..4].copy_from_slice(b"XXXX");
        let err = parse_header(&buf, 0, 8, 8).unwrap_err();
        assert!(matches!(err, Error::InvalidFixedArraySignature { .. }));
    }

    /// Likewise, a data block parse must reject a non-"FADB" signature.
    #[test]
    fn test_fadb_bad_signature() {
        let header = FaHeader {
            client_id: 0,
            entry_size: 8,
            page_bits: 0,
            num_entries: 1,
            data_block_address: 0,
        };
        let mut buf = vec![0u8; 64];
        buf[..4].copy_from_slice(b"XXXX");
        let err = parse_data_block(&buf, 0, &header, 8).unwrap_err();
        assert!(matches!(err, Error::InvalidFixedArraySignature { .. }));
    }
}