Skip to main content

async_hdf5/
fixed_array.rs

1//! Fixed Array (FARRAY) chunk index reader.
2//!
3//! The Fixed Array is used for chunked datasets where all dimensions are fixed
4//! (no unlimited dimensions). It provides O(1) chunk lookup.
5//!
6//! Structures:
7//! - FAHD (Fixed Array Header) — metadata + pointer to data block
8//! - FADB (Fixed Array Data Block) — array of chunk entries, optionally paged
9
10use std::sync::Arc;
11
12use crate::endian::HDF5Reader;
13use crate::error::{HDF5Error, Result};
14use crate::reader::AsyncFileReader;
15
/// Parsed Fixed Array header (FAHD).
///
/// The header carries the array geometry and the address of the single data
/// block (FADB) that holds the chunk entries.
#[derive(Debug)]
pub struct FixedArrayHeader {
    /// Client ID: 0 = non-filtered chunks, 1 = filtered chunks.
    pub client_id: u8,
    /// Size of each entry in bytes (on disk, within the data block).
    pub entry_size: u8,
    /// Page bits — log2 of entries per page; also the pagination threshold
    /// (pages used if max_entries >= 2^page_bits).
    pub page_bits: u8,
    /// Maximum number of entries (total chunks).
    pub max_num_entries: u64,
    /// Address of the data block (FADB); may be the undefined address when no
    /// chunks have been allocated yet.
    pub data_block_address: u64,
}
30
/// A single chunk entry from a fixed or extensible array.
#[derive(Debug, Clone)]
pub(crate) struct FixedArrayChunkEntry {
    /// File byte offset of the chunk (`u64::MAX` marks an undefined /
    /// unallocated chunk).
    pub address: u64,
    /// Chunk size in bytes (compressed/on-disk). For non-filtered chunks this
    /// is the fixed uncompressed chunk size supplied by the caller.
    pub chunk_size: u64,
    /// Filter mask (always 0 for non-filtered chunks).
    pub filter_mask: u32,
}
41
42impl FixedArrayHeader {
43    /// Read and parse a Fixed Array header from the given address.
44    pub async fn read(
45        reader: &Arc<dyn AsyncFileReader>,
46        address: u64,
47        size_of_offsets: u8,
48        size_of_lengths: u8,
49    ) -> Result<Self> {
50        // FAHD is small: 4 (sig) + 1 (ver) + 1 (client) + 1 (entry_size) + 1 (page_bits)
51        //   + L (max_entries) + O (data_block_addr) + 4 (checksum)
52        let fetch_size = (12 + size_of_offsets + size_of_lengths) as u64;
53        let data = reader.get_bytes(address..address + fetch_size).await?;
54        let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
55
56        r.read_signature(b"FAHD")?;
57        let version = r.read_u8()?;
58        if version != 0 {
59            return Err(HDF5Error::General(format!(
60                "unsupported Fixed Array header version: {version}"
61            )));
62        }
63
64        let client_id = r.read_u8()?;
65        let entry_size = r.read_u8()?;
66        let page_bits = r.read_u8()?;
67        let max_num_entries = r.read_length()?;
68        let data_block_address = r.read_offset()?;
69        // Skip checksum
70
71        Ok(Self {
72            client_id,
73            entry_size,
74            page_bits,
75            max_num_entries,
76            data_block_address,
77        })
78    }
79}
80
81/// Read all chunk entries from a Fixed Array.
82///
83/// `layout_version` is the data layout message version (4 or 5) — this affects
84/// how filtered chunk entries are encoded.
85pub(crate) async fn read_fixed_array_entries(
86    reader: &Arc<dyn AsyncFileReader>,
87    header: &FixedArrayHeader,
88    size_of_offsets: u8,
89    size_of_lengths: u8,
90    uncompressed_chunk_size: u64,
91    layout_version: u8,
92) -> Result<Vec<FixedArrayChunkEntry>> {
93    if HDF5Reader::is_undef_addr(header.data_block_address, size_of_offsets) {
94        return Ok(vec![]);
95    }
96
97    let _is_filtered = header.client_id == 1;
98    let num_entries = header.max_num_entries as usize;
99
100    // Determine if paging is used
101    let use_paging = num_entries as u64 >= (1u64 << header.page_bits);
102
103    // FADB header: 4 (sig) + 1 (ver) + 1 (client) + O (header_addr) + ...
104    let fadb_header_size = 6 + size_of_offsets as u64;
105
106    if use_paging {
107        read_paged_entries(
108            reader,
109            header,
110            size_of_offsets,
111            size_of_lengths,
112            uncompressed_chunk_size,
113            layout_version,
114            fadb_header_size,
115        )
116        .await
117    } else {
118        read_unpaged_entries(
119            reader,
120            header,
121            size_of_offsets,
122            size_of_lengths,
123            uncompressed_chunk_size,
124            layout_version,
125            fadb_header_size,
126        )
127        .await
128    }
129}
130
131/// Read entries from an unpaged data block (all entries inline).
132async fn read_unpaged_entries(
133    reader: &Arc<dyn AsyncFileReader>,
134    header: &FixedArrayHeader,
135    size_of_offsets: u8,
136    size_of_lengths: u8,
137    uncompressed_chunk_size: u64,
138    layout_version: u8,
139    fadb_header_size: u64,
140) -> Result<Vec<FixedArrayChunkEntry>> {
141    let num_entries = header.max_num_entries as usize;
142    let entries_size = num_entries as u64 * header.entry_size as u64;
143    let total_size = fadb_header_size + entries_size + 4; // +4 for checksum
144
145    let data = reader
146        .get_bytes(header.data_block_address..header.data_block_address + total_size)
147        .await?;
148    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
149
150    // Parse FADB header
151    r.read_signature(b"FADB")?;
152    let version = r.read_u8()?;
153    if version != 0 {
154        return Err(HDF5Error::General(format!(
155            "unsupported Fixed Array data block version: {version}"
156        )));
157    }
158    let _client_id = r.read_u8()?;
159    let _header_address = r.read_offset()?;
160
161    // Read entries
162    parse_entries(
163        &mut r,
164        num_entries,
165        header.client_id == 1,
166        size_of_offsets,
167        uncompressed_chunk_size,
168        layout_version,
169    )
170}
171
172/// Read entries from a paged data block.
173async fn read_paged_entries(
174    reader: &Arc<dyn AsyncFileReader>,
175    header: &FixedArrayHeader,
176    size_of_offsets: u8,
177    size_of_lengths: u8,
178    uncompressed_chunk_size: u64,
179    layout_version: u8,
180    fadb_header_size: u64,
181) -> Result<Vec<FixedArrayChunkEntry>> {
182    let num_entries = header.max_num_entries as usize;
183    let entries_per_page = 1usize << header.page_bits;
184    let num_pages = num_entries.div_ceil(entries_per_page);
185    let bitmap_size = num_pages.div_ceil(8);
186
187    // Fetch the FADB header + bitmap
188    let header_plus_bitmap = fadb_header_size + bitmap_size as u64 + 4; // +4 checksum
189    let data = reader
190        .get_bytes(header.data_block_address..header.data_block_address + header_plus_bitmap)
191        .await?;
192    let mut r = HDF5Reader::with_sizes(data, size_of_offsets, size_of_lengths);
193
194    r.read_signature(b"FADB")?;
195    let version = r.read_u8()?;
196    if version != 0 {
197        return Err(HDF5Error::General(format!(
198            "unsupported Fixed Array data block version: {version}"
199        )));
200    }
201    let _client_id = r.read_u8()?;
202    let _header_address = r.read_offset()?;
203
204    // Read bitmap
205    let bitmap_bytes = r.read_bytes(bitmap_size)?;
206
207    // Pages follow immediately after the FADB (header + bitmap + checksum)
208    let page_data_size = entries_per_page as u64 * header.entry_size as u64;
209    let page_total_size = page_data_size + 4; // +4 checksum per page
210    let pages_start = header.data_block_address + fadb_header_size + bitmap_size as u64 + 4;
211
212    let mut all_entries = Vec::with_capacity(num_entries);
213    let mut page_file_offset = pages_start;
214
215    for page_idx in 0..num_pages {
216        let byte_idx = page_idx / 8;
217        let bit_idx = page_idx % 8;
218        let page_allocated = (bitmap_bytes[byte_idx] >> bit_idx) & 1 != 0;
219
220        if !page_allocated {
221            // Unallocated page — fill with undefined entries
222            let entries_in_page = if page_idx == num_pages - 1 {
223                num_entries - page_idx * entries_per_page
224            } else {
225                entries_per_page
226            };
227            for _ in 0..entries_in_page {
228                all_entries.push(FixedArrayChunkEntry {
229                    address: u64::MAX, // undefined
230                    chunk_size: 0,
231                    filter_mask: 0,
232                });
233            }
234            // Don't advance file offset — unallocated pages have no storage
235            continue;
236        }
237
238        // Read this page
239        let page_data = reader
240            .get_bytes(page_file_offset..page_file_offset + page_total_size)
241            .await?;
242        let mut pr = HDF5Reader::with_sizes(page_data, size_of_offsets, size_of_lengths);
243
244        let entries_in_page = if page_idx == num_pages - 1 {
245            num_entries - page_idx * entries_per_page
246        } else {
247            entries_per_page
248        };
249
250        let page_entries = parse_entries(
251            &mut pr,
252            entries_in_page,
253            header.client_id == 1,
254            size_of_offsets,
255            uncompressed_chunk_size,
256            layout_version,
257        )?;
258        all_entries.extend(page_entries);
259
260        page_file_offset += page_total_size;
261    }
262
263    Ok(all_entries)
264}
265
266/// Parse chunk entries from a reader.
267pub(crate) fn parse_entries(
268    r: &mut HDF5Reader,
269    count: usize,
270    is_filtered: bool,
271    _size_of_offsets: u8,
272    uncompressed_chunk_size: u64,
273    layout_version: u8,
274) -> Result<Vec<FixedArrayChunkEntry>> {
275    let mut entries = Vec::with_capacity(count);
276
277    for _ in 0..count {
278        if is_filtered {
279            let address = r.read_offset()?;
280
281            let chunk_size = if layout_version >= 5 {
282                // v5: chunk size uses size_of_offsets width
283                r.read_offset()?
284            } else {
285                // v4: chunk size uses a variable encoding
286                // "entry_size - size_of_offsets - 4" bytes for the size field
287                let _size_field_len = (r.get_ref().len() as u64 - r.position()).min(8) as u8; // fallback
288                                                                                              // Actually, the entry_size tells us exactly how many bytes per entry.
289                                                                                              // entry_size = size_of_offsets + chunk_size_bytes + 4 (filter_mask)
290                                                                                              // So chunk_size_bytes = entry_size - size_of_offsets - 4
291                                                                                              // But we don't have entry_size here directly. We can compute it from
292                                                                                              // the uncompressed chunk size: "one more than bytes needed to encode it"
293                let nbytes = bytes_needed_for(uncompressed_chunk_size);
294                read_n_byte_uint(r, nbytes)?
295            };
296
297            let filter_mask = r.read_u32()?;
298
299            entries.push(FixedArrayChunkEntry {
300                address,
301                chunk_size,
302                filter_mask,
303            });
304        } else {
305            // Non-filtered: just an address
306            let address = r.read_offset()?;
307
308            entries.push(FixedArrayChunkEntry {
309                address,
310                chunk_size: uncompressed_chunk_size,
311                filter_mask: 0,
312            });
313        }
314    }
315
316    Ok(entries)
317}
318
/// Calculate how many bytes are needed to encode a value, plus one (HDF5
/// convention: allow an extra byte in case a filter grows the chunk), capped
/// at 8.
pub(crate) fn bytes_needed_for(value: u64) -> u8 {
    match value {
        0 => 1,
        v => {
            // Count whole bytes occupied by the value.
            let mut byte_count: u8 = 0;
            let mut rest = v;
            while rest != 0 {
                byte_count += 1;
                rest >>= 8;
            }
            // "One more than needed", never exceeding a u64's 8 bytes.
            (byte_count + 1).min(8)
        }
    }
}
329
330/// Read an N-byte unsigned integer (little-endian).
331pub(crate) fn read_n_byte_uint(r: &mut HDF5Reader, n: u8) -> Result<u64> {
332    let bytes = r.read_bytes(n as usize)?;
333    let mut val = 0u64;
334    for (i, &b) in bytes.iter().enumerate() {
335        val |= (b as u64) << (i * 8);
336    }
337    Ok(val)
338}