Skip to main content

hdf5_reader/
chunk_index.rs

1//! Chunk indexing — resolves chunk locations from various storage strategies.
2//!
3//! Supports all six HDF5 chunk indexing types:
4//! - V1 B-tree chunk indexing (btree_v1 type 1) — dispatched externally
5//! - V2 B-tree chunk indexing (btree_v2 types 10 and 11)
6//! - Single chunk indexing
7//! - Implicit chunk indexing
8//! - Fixed array indexing (fixed_array module)
9//! - Extensible array indexing (extensible_array module)
10
11use crate::error::{Error, Result};
12use crate::storage::Storage;
13
/// A resolved chunk location within the file.
///
/// Entries compare by value: two entries are equal when their address, size,
/// filter mask and offsets are all equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkEntry {
    /// Absolute file address of the chunk data.
    pub address: u64,
    /// Size of the chunk data in bytes (after filtering, i.e., on-disk size).
    ///
    /// The B-tree v2 collectors in this module store `0` here for unfiltered
    /// records; the caller must then compute the size from the chunk
    /// dimensions and the element size.
    pub size: u64,
    /// Filter mask — each bit indicates whether the corresponding filter
    /// in the pipeline was skipped (1 = skipped).
    pub filter_mask: u32,
    /// Chunk offsets within the dataset (one per dimension).
    pub offsets: Vec<u64>,
}
27
/// Reports whether a chunk falls inside an optional, inclusive window of
/// chunk-grid indices.
///
/// Each entry of `offsets` is divided by the matching `chunk_dims` entry to
/// obtain the chunk's grid index along that dimension. The chunk is accepted
/// only if every index lies within `[first[dim], last[dim]]` of `chunk_bounds`.
/// When `chunk_bounds` is `None`, every chunk is accepted.
///
/// # Panics
///
/// Panics if a consulted `chunk_dims` entry is zero, or if `chunk_dims` or a
/// consulted bounds slice has fewer entries than `offsets`.
fn chunk_overlaps_bounds(
    offsets: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
) -> bool {
    let (lower, upper) = match chunk_bounds {
        Some(window) => window,
        // No window requested: nothing is filtered out.
        None => return true,
    };

    for (dim, &offset) in offsets.iter().enumerate() {
        let grid_index = offset / u64::from(chunk_dims[dim]);
        if grid_index < lower[dim] || grid_index > upper[dim] {
            return false;
        }
    }
    true
}
42
43fn checked_mul_u64(lhs: u64, rhs: u64, context: &str) -> Result<u64> {
44    lhs.checked_mul(rhs)
45        .ok_or_else(|| Error::InvalidData(format!("{context} overflows u64")))
46}
47
48fn checked_add_u64(lhs: u64, rhs: u64, context: &str) -> Result<u64> {
49    lhs.checked_add(rhs)
50        .ok_or_else(|| Error::InvalidData(format!("{context} overflows u64")))
51}
52
53fn checked_usize(value: u64, context: &str) -> Result<usize> {
54    usize::try_from(value).map_err(|_| {
55        Error::InvalidData(format!(
56            "{context} value {value} exceeds platform usize capacity"
57        ))
58    })
59}
60
61fn chunk_linear_index(chunk_indices: &[u64], chunks_per_dim: &[u64]) -> Result<u64> {
62    let mut linear = 0u64;
63    for (dim, chunk_index) in chunk_indices.iter().enumerate() {
64        linear = checked_mul_u64(linear, chunks_per_dim[dim], "implicit chunk linear index")?;
65        linear = checked_add_u64(linear, *chunk_index, "implicit chunk linear index")?;
66    }
67    Ok(linear)
68}
69
70/// Collect chunk entries from a B-tree v2 chunk index.
71pub fn collect_v2_chunk_entries(
72    data: &[u8],
73    btree_address: u64,
74    offset_size: u8,
75    length_size: u8,
76    ndim: u32,
77    chunk_dims: &[u32],
78    chunk_bounds: Option<(&[u64], &[u64])>,
79) -> Result<Vec<ChunkEntry>> {
80    let mut cursor = crate::io::Cursor::new(data);
81    cursor.set_position(btree_address);
82    let header = crate::btree_v2::BTreeV2Header::parse(&mut cursor, offset_size, length_size)?;
83
84    let records = crate::btree_v2::collect_btree_v2_records(
85        data,
86        &header,
87        offset_size,
88        length_size,
89        Some(ndim),
90        chunk_dims,
91        chunk_bounds,
92    )?;
93
94    let mut entries = Vec::with_capacity(records.len());
95    for record in records {
96        match record {
97            crate::btree_v2::BTreeV2Record::ChunkedNonFiltered { address, offsets }
98                if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) =>
99            {
100                entries.push(ChunkEntry {
101                    address,
102                    size: 0, // caller must compute from chunk dims * elem_size
103                    filter_mask: 0,
104                    offsets,
105                });
106            }
107            crate::btree_v2::BTreeV2Record::ChunkedFiltered {
108                address,
109                chunk_size,
110                filter_mask,
111                offsets,
112            } if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) => {
113                entries.push(ChunkEntry {
114                    address,
115                    size: chunk_size,
116                    filter_mask,
117                    offsets,
118                });
119            }
120            _ => {
121                // Skip non-chunk records
122            }
123        }
124    }
125
126    Ok(entries)
127}
128
129/// Collect chunk entries from a B-tree v2 chunk index using random-access storage.
130pub fn collect_v2_chunk_entries_storage(
131    storage: &dyn Storage,
132    btree_address: u64,
133    offset_size: u8,
134    length_size: u8,
135    ndim: u32,
136    chunk_dims: &[u32],
137    chunk_bounds: Option<(&[u64], &[u64])>,
138) -> Result<Vec<ChunkEntry>> {
139    let header = crate::btree_v2::BTreeV2Header::parse_at_storage(
140        storage,
141        btree_address,
142        offset_size,
143        length_size,
144    )?;
145    let records = crate::btree_v2::collect_btree_v2_records_storage(
146        storage,
147        &header,
148        offset_size,
149        length_size,
150        Some(ndim),
151        chunk_dims,
152        chunk_bounds,
153    )?;
154
155    let mut entries = Vec::with_capacity(records.len());
156    for record in records {
157        match record {
158            crate::btree_v2::BTreeV2Record::ChunkedNonFiltered { address, offsets }
159                if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) =>
160            {
161                entries.push(ChunkEntry {
162                    address,
163                    size: 0,
164                    filter_mask: 0,
165                    offsets,
166                });
167            }
168            crate::btree_v2::BTreeV2Record::ChunkedFiltered {
169                address,
170                chunk_size,
171                filter_mask,
172                offsets,
173            } if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) => {
174                entries.push(ChunkEntry {
175                    address,
176                    size: chunk_size,
177                    filter_mask,
178                    offsets,
179                });
180            }
181            _ => {}
182        }
183    }
184
185    Ok(entries)
186}
187
188/// Collect chunk entries for implicit indexing.
189///
190/// Implicit chunks are laid out sequentially starting at the given address.
191/// Each chunk has the same size = product(chunk_dims) * elem_size.
192pub fn collect_implicit_chunk_entries(
193    start_address: u64,
194    dataset_shape: &[u64],
195    chunk_dims: &[u32],
196    elem_size: usize,
197    chunk_bounds: Option<(&[u64], &[u64])>,
198) -> Result<Vec<ChunkEntry>> {
199    let chunk_elements = chunk_dims.iter().try_fold(1u64, |acc, &dim| {
200        checked_mul_u64(acc, u64::from(dim), "implicit chunk element count")
201    })?;
202    let elem_size = u64::try_from(elem_size).map_err(|_| {
203        Error::InvalidData("implicit chunk element size exceeds u64 capacity".to_string())
204    })?;
205    let chunk_bytes = checked_mul_u64(chunk_elements, elem_size, "implicit chunk byte size")?;
206    let ndim = dataset_shape.len();
207
208    // Compute how many chunks along each dimension
209    let mut chunks_per_dim = Vec::with_capacity(ndim);
210    for i in 0..ndim {
211        let chunk_dim = u64::from(chunk_dims[i]);
212        if chunk_dim == 0 {
213            return Err(Error::InvalidData(format!(
214                "implicit chunk dimension {i} has zero extent"
215            )));
216        }
217        chunks_per_dim.push(dataset_shape[i].div_ceil(chunk_dim));
218    }
219
220    if ndim == 0 {
221        return Ok(vec![ChunkEntry {
222            address: start_address,
223            size: chunk_bytes,
224            filter_mask: 0,
225            offsets: Vec::new(),
226        }]);
227    }
228
229    let (first_chunk, last_chunk): (Vec<u64>, Vec<u64>) = match chunk_bounds {
230        Some((first, last)) => (first.to_vec(), last.to_vec()),
231        None => (
232            vec![0u64; ndim],
233            chunks_per_dim
234                .iter()
235                .map(|count| count.saturating_sub(1))
236                .collect(),
237        ),
238    };
239
240    let mut chunk_counts = Vec::with_capacity(ndim);
241    for dim in 0..ndim {
242        let selected = last_chunk[dim]
243            .checked_sub(first_chunk[dim])
244            .and_then(|value| value.checked_add(1))
245            .ok_or_else(|| {
246                Error::InvalidData("implicit chunk selection bounds are invalid".to_string())
247            })?;
248        chunk_counts.push(selected);
249    }
250    let total_selected_chunks = chunk_counts.iter().try_fold(1u64, |acc, &count| {
251        checked_mul_u64(acc, count, "implicit selected chunk count")
252    })?;
253    let mut entries = Vec::with_capacity(checked_usize(
254        total_selected_chunks,
255        "implicit selected chunk count",
256    )?);
257    let mut chunk_indices = first_chunk.clone();
258
259    loop {
260        let chunk_idx = chunk_linear_index(&chunk_indices, &chunks_per_dim)?;
261        let offsets = chunk_indices
262            .iter()
263            .enumerate()
264            .map(|(dim, chunk_index)| {
265                checked_mul_u64(
266                    *chunk_index,
267                    u64::from(chunk_dims[dim]),
268                    "implicit chunk offset",
269                )
270            })
271            .collect::<Result<Vec<_>>>()?;
272        let chunk_data_offset =
273            checked_mul_u64(chunk_idx, chunk_bytes, "implicit chunk byte offset")?;
274
275        entries.push(ChunkEntry {
276            address: checked_add_u64(start_address, chunk_data_offset, "implicit chunk address")?,
277            size: chunk_bytes,
278            filter_mask: 0,
279            offsets,
280        });
281
282        let mut advanced = false;
283        for dim in (0..ndim).rev() {
284            if chunk_indices[dim] < last_chunk[dim] {
285                chunk_indices[dim] += 1;
286                if dim + 1 < ndim {
287                    chunk_indices[(dim + 1)..ndim].copy_from_slice(&first_chunk[(dim + 1)..ndim]);
288                }
289                advanced = true;
290                break;
291            }
292        }
293
294        if !advanced {
295            break;
296        }
297    }
298
299    Ok(entries)
300}
301
302/// Resolve a single-chunk layout.
303///
304/// The entire dataset is stored as one chunk at the given address.
305pub fn single_chunk_entry(
306    address: u64,
307    filtered_size: u64,
308    filter_mask: u32,
309    ndim: usize,
310) -> ChunkEntry {
311    ChunkEntry {
312        address,
313        size: filtered_size,
314        filter_mask,
315        offsets: vec![0u64; ndim],
316    }
317}
318
#[cfg(test)]
mod tests {
    use super::*;

    /// `ChunkEntry` must remain cloneable and debug-printable.
    #[test]
    fn chunk_entry_debug_clone() {
        let original = ChunkEntry {
            address: 0x1000,
            size: 4096,
            filter_mask: 0,
            offsets: vec![0, 0],
        };
        let copy = original.clone();
        assert_eq!(copy.address, 0x1000);
        let _rendered = format!("{original:?}");
    }

    /// A 10x20 dataset with 5x10 chunks forms a 2x2 grid in row-major order.
    #[test]
    fn implicit_chunk_entries() {
        let entries = collect_implicit_chunk_entries(1000, &[10, 20], &[5, 10], 4, None).unwrap();
        // 2 chunks along dim 0, 2 chunks along dim 1 = 4 total
        assert_eq!(entries.len(), 4);

        let chunk_bytes = 5 * 10 * 4; // = 200
        assert_eq!(entries[0].address, 1000);
        assert_eq!(entries[1].address, 1000 + chunk_bytes);

        let expected_offsets: [[u64; 2]; 4] = [[0, 0], [0, 10], [5, 0], [5, 10]];
        for (entry, expected) in entries.iter().zip(expected_offsets) {
            assert_eq!(entry.offsets, expected);
        }
    }

    /// The element count fits in `u64`, but multiplying by the element size does not.
    #[test]
    fn implicit_chunk_entries_reject_chunk_byte_overflow() {
        let result =
            collect_implicit_chunk_entries(1000, &[10, 10], &[u32::MAX, u32::MAX], 2, None);
        let message = result.unwrap_err().to_string();
        assert!(message.contains("implicit chunk byte size"));
    }

    /// Selecting the second chunk from a start address of `u64::MAX` overflows the address.
    #[test]
    fn implicit_chunk_entries_reject_address_overflow() {
        let result = collect_implicit_chunk_entries(u64::MAX, &[2], &[1], 1, Some((&[1], &[1])));
        let message = result.unwrap_err().to_string();
        assert!(message.contains("implicit chunk address"));
    }

    /// A single-chunk layout is anchored at the origin in every dimension.
    #[test]
    fn single_chunk_entry_uses_origin_offsets() {
        let ChunkEntry {
            address,
            size,
            offsets,
            ..
        } = single_chunk_entry(0x2000, 8192, 0, 3);
        assert_eq!(address, 0x2000);
        assert_eq!(size, 8192);
        assert_eq!(offsets, vec![0, 0, 0]);
    }
}