Skip to main content

hdf5_reader/
chunk_index.rs

1//! Chunk indexing — resolves chunk locations from various storage strategies.
2//!
3//! Supports all six HDF5 chunk indexing types:
4//! - V1 B-tree chunk indexing (btree_v1 type 1) — dispatched externally
5//! - V2 B-tree chunk indexing (btree_v2 types 10 and 11)
6//! - Single chunk indexing
7//! - Implicit chunk indexing
8//! - Fixed array indexing (fixed_array module)
9//! - Extensible array indexing (extensible_array module)
10
11use crate::error::Result;
12use crate::storage::Storage;
13
/// A resolved chunk location within the file.
///
/// Produced by every chunk-index reader in this module. Which reader created
/// the entry determines whether `size` is meaningful — see the field docs.
#[derive(Debug, Clone)]
pub struct ChunkEntry {
    /// Absolute file address of the chunk data.
    pub address: u64,
    /// Size of the chunk data in bytes (after filtering, i.e., on-disk size).
    /// Non-filtered B-tree v2 records do not store a size, so the v2 readers
    /// leave this as 0 and the caller must compute it from the chunk
    /// dimensions and element size.
    pub size: u64,
    /// Filter mask — each bit indicates whether the corresponding filter
    /// in the pipeline was skipped (1 = skipped).
    pub filter_mask: u32,
    /// Chunk offsets within the dataset (one per dimension), in dataset
    /// element coordinates (chunk index × chunk dimension).
    pub offsets: Vec<u64>,
}
27
/// Returns `true` when the chunk at the given dataset `offsets` falls inside
/// the optional selection window.
///
/// `chunk_bounds` is an inclusive `(first, last)` pair of per-dimension chunk
/// indices; `None` means no restriction (every chunk overlaps).
fn chunk_overlaps_bounds(
    offsets: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
) -> bool {
    let (first_chunk, last_chunk) = match chunk_bounds {
        Some(bounds) => bounds,
        None => return true,
    };

    for (dim, &offset) in offsets.iter().enumerate() {
        // Convert the element offset back into a chunk index along this axis.
        let chunk_index = offset / u64::from(chunk_dims[dim]);
        if chunk_index < first_chunk[dim] || chunk_index > last_chunk[dim] {
            return false;
        }
    }
    true
}
42
/// Flattens per-dimension chunk indices into a single row-major linear index
/// over the full chunk grid described by `chunks_per_dim`.
fn chunk_linear_index(chunk_indices: &[u64], chunks_per_dim: &[u64]) -> u64 {
    chunk_indices
        .iter()
        .enumerate()
        .fold(0u64, |linear, (dim, &chunk_index)| {
            linear * chunks_per_dim[dim] + chunk_index
        })
}
50
51/// Collect chunk entries from a B-tree v2 chunk index.
52pub fn collect_v2_chunk_entries(
53    data: &[u8],
54    btree_address: u64,
55    offset_size: u8,
56    length_size: u8,
57    ndim: u32,
58    chunk_dims: &[u32],
59    chunk_bounds: Option<(&[u64], &[u64])>,
60) -> Result<Vec<ChunkEntry>> {
61    let mut cursor = crate::io::Cursor::new(data);
62    cursor.set_position(btree_address);
63    let header = crate::btree_v2::BTreeV2Header::parse(&mut cursor, offset_size, length_size)?;
64
65    let records = crate::btree_v2::collect_btree_v2_records(
66        data,
67        &header,
68        offset_size,
69        length_size,
70        Some(ndim),
71        chunk_dims,
72        chunk_bounds,
73    )?;
74
75    let mut entries = Vec::with_capacity(records.len());
76    for record in records {
77        match record {
78            crate::btree_v2::BTreeV2Record::ChunkedNonFiltered { address, offsets }
79                if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) =>
80            {
81                entries.push(ChunkEntry {
82                    address,
83                    size: 0, // caller must compute from chunk dims * elem_size
84                    filter_mask: 0,
85                    offsets,
86                });
87            }
88            crate::btree_v2::BTreeV2Record::ChunkedFiltered {
89                address,
90                chunk_size,
91                filter_mask,
92                offsets,
93            } if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) => {
94                entries.push(ChunkEntry {
95                    address,
96                    size: chunk_size,
97                    filter_mask,
98                    offsets,
99                });
100            }
101            _ => {
102                // Skip non-chunk records
103            }
104        }
105    }
106
107    Ok(entries)
108}
109
110/// Collect chunk entries from a B-tree v2 chunk index using random-access storage.
111pub fn collect_v2_chunk_entries_storage(
112    storage: &dyn Storage,
113    btree_address: u64,
114    offset_size: u8,
115    length_size: u8,
116    ndim: u32,
117    chunk_dims: &[u32],
118    chunk_bounds: Option<(&[u64], &[u64])>,
119) -> Result<Vec<ChunkEntry>> {
120    let header = crate::btree_v2::BTreeV2Header::parse_at_storage(
121        storage,
122        btree_address,
123        offset_size,
124        length_size,
125    )?;
126    let records = crate::btree_v2::collect_btree_v2_records_storage(
127        storage,
128        &header,
129        offset_size,
130        length_size,
131        Some(ndim),
132        chunk_dims,
133        chunk_bounds,
134    )?;
135
136    let mut entries = Vec::with_capacity(records.len());
137    for record in records {
138        match record {
139            crate::btree_v2::BTreeV2Record::ChunkedNonFiltered { address, offsets }
140                if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) =>
141            {
142                entries.push(ChunkEntry {
143                    address,
144                    size: 0,
145                    filter_mask: 0,
146                    offsets,
147                });
148            }
149            crate::btree_v2::BTreeV2Record::ChunkedFiltered {
150                address,
151                chunk_size,
152                filter_mask,
153                offsets,
154            } if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) => {
155                entries.push(ChunkEntry {
156                    address,
157                    size: chunk_size,
158                    filter_mask,
159                    offsets,
160                });
161            }
162            _ => {}
163        }
164    }
165
166    Ok(entries)
167}
168
169/// Collect chunk entries for implicit indexing.
170///
171/// Implicit chunks are laid out sequentially starting at the given address.
172/// Each chunk has the same size = product(chunk_dims) * elem_size.
173pub fn collect_implicit_chunk_entries(
174    start_address: u64,
175    dataset_shape: &[u64],
176    chunk_dims: &[u32],
177    elem_size: usize,
178    chunk_bounds: Option<(&[u64], &[u64])>,
179) -> Vec<ChunkEntry> {
180    let chunk_bytes: u64 = chunk_dims.iter().map(|&d| d as u64).product::<u64>() * elem_size as u64;
181    let ndim = dataset_shape.len();
182
183    // Compute how many chunks along each dimension
184    let chunks_per_dim: Vec<u64> = (0..ndim)
185        .map(|i| dataset_shape[i].div_ceil(chunk_dims[i] as u64))
186        .collect();
187
188    if ndim == 0 {
189        return vec![ChunkEntry {
190            address: start_address,
191            size: chunk_bytes,
192            filter_mask: 0,
193            offsets: Vec::new(),
194        }];
195    }
196
197    let (first_chunk, last_chunk): (Vec<u64>, Vec<u64>) = match chunk_bounds {
198        Some((first, last)) => (first.to_vec(), last.to_vec()),
199        None => (
200            vec![0u64; ndim],
201            chunks_per_dim
202                .iter()
203                .map(|count| count.saturating_sub(1))
204                .collect(),
205        ),
206    };
207
208    let mut chunk_counts = Vec::with_capacity(ndim);
209    for dim in 0..ndim {
210        chunk_counts.push(last_chunk[dim] - first_chunk[dim] + 1);
211    }
212    let total_selected_chunks: u64 = chunk_counts.iter().product();
213    let mut entries = Vec::with_capacity(total_selected_chunks as usize);
214    let mut chunk_indices = first_chunk.clone();
215
216    loop {
217        let chunk_idx = chunk_linear_index(&chunk_indices, &chunks_per_dim);
218        let offsets = chunk_indices
219            .iter()
220            .enumerate()
221            .map(|(dim, chunk_index)| chunk_index * u64::from(chunk_dims[dim]))
222            .collect();
223
224        entries.push(ChunkEntry {
225            address: start_address + chunk_idx * chunk_bytes,
226            size: chunk_bytes,
227            filter_mask: 0,
228            offsets,
229        });
230
231        let mut advanced = false;
232        for dim in (0..ndim).rev() {
233            if chunk_indices[dim] < last_chunk[dim] {
234                chunk_indices[dim] += 1;
235                if dim + 1 < ndim {
236                    chunk_indices[(dim + 1)..ndim].copy_from_slice(&first_chunk[(dim + 1)..ndim]);
237                }
238                advanced = true;
239                break;
240            }
241        }
242
243        if !advanced {
244            break;
245        }
246    }
247
248    entries
249}
250
251/// Resolve a single-chunk layout.
252///
253/// The entire dataset is stored as one chunk at the given address.
254pub fn single_chunk_entry(
255    address: u64,
256    filtered_size: u64,
257    filter_mask: u32,
258    ndim: usize,
259) -> ChunkEntry {
260    ChunkEntry {
261        address,
262        size: filtered_size,
263        filter_mask,
264        offsets: vec![0u64; ndim],
265    }
266}
267
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunk_entry_debug_clone() {
        let original = ChunkEntry {
            address: 0x1000,
            size: 4096,
            filter_mask: 0,
            offsets: vec![0, 0],
        };
        let duplicate = original.clone();
        assert_eq!(duplicate.address, 0x1000);
        // Debug formatting must not panic.
        let _ = format!("{:?}", original);
    }

    #[test]
    fn test_implicit_chunk_entries() {
        // 10x20 dataset with 5x10 chunks: a 2x2 chunk grid, 4 chunks total,
        // each occupying 5 * 10 * 4 = 200 bytes on disk.
        let chunk_bytes = 200;
        let entries = collect_implicit_chunk_entries(1000, &[10, 20], &[5, 10], 4, None);
        assert_eq!(entries.len(), 4);
        assert_eq!(entries[0].address, 1000);
        assert_eq!(entries[0].offsets, vec![0, 0]);
        assert_eq!(entries[1].address, 1000 + chunk_bytes);
        assert_eq!(entries[1].offsets, vec![0, 10]);
        assert_eq!(entries[2].offsets, vec![5, 0]);
        assert_eq!(entries[3].offsets, vec![5, 10]);
    }

    #[test]
    fn test_single_chunk_entry() {
        let entry = single_chunk_entry(0x2000, 8192, 0, 3);
        assert_eq!(entry.address, 0x2000);
        assert_eq!(entry.size, 8192);
        assert_eq!(entry.offsets, vec![0, 0, 0]);
    }
}
305}