// hdf5_reader/chunk_index.rs
//! Chunk indexing — resolves chunk locations from various storage strategies.
//!
//! Supports all six HDF5 chunk indexing types:
//! - V1 B-tree chunk indexing (btree_v1 type 1) — dispatched externally
//! - V2 B-tree chunk indexing (btree_v2 types 10 and 11)
//! - Single chunk indexing
//! - Implicit chunk indexing
//! - Fixed array indexing (fixed_array module)
//! - Extensible array indexing (extensible_array module)

use crate::error::Result;

/// A resolved chunk location within the file.
#[derive(Debug, Clone)]
pub struct ChunkEntry {
    /// Absolute file address of the chunk data.
    pub address: u64,
    /// Size of the chunk data in bytes (after filtering, i.e., on-disk size).
    pub size: u64,
    /// Filter mask — each bit indicates whether the corresponding filter
    /// in the pipeline was skipped (1 = skipped).
    pub filter_mask: u32,
    /// Chunk offsets within the dataset (one per dimension).
    pub offsets: Vec<u64>,
}

/// Returns `true` when the chunk starting at element `offsets` falls inside
/// the inclusive per-dimension chunk-index bounds, or when no bounds were
/// requested.
///
/// Each element offset is converted to a chunk index by dividing by that
/// dimension's chunk extent before comparing against `first`/`last`.
fn chunk_overlaps_bounds(
    offsets: &[u64],
    chunk_dims: &[u32],
    chunk_bounds: Option<(&[u64], &[u64])>,
) -> bool {
    // No selection given: every chunk qualifies.
    let Some((first_chunk, last_chunk)) = chunk_bounds else {
        return true;
    };

    offsets.iter().enumerate().all(|(dim, offset)| {
        let chunk_index = *offset / u64::from(chunk_dims[dim]);
        chunk_index >= first_chunk[dim] && chunk_index <= last_chunk[dim]
    })
}

/// Converts per-dimension chunk indices into a row-major linear chunk index,
/// given the total number of chunks along each dimension.
///
/// The last dimension varies fastest, matching the HDF5 chunk ordering used
/// by implicit indexing.
fn chunk_linear_index(chunk_indices: &[u64], chunks_per_dim: &[u64]) -> u64 {
    let mut linear = 0u64;
    for (dim, chunk_index) in chunk_indices.iter().enumerate() {
        linear = linear * chunks_per_dim[dim] + chunk_index;
    }
    linear
}

50/// Collect chunk entries from a B-tree v2 chunk index.
51pub fn collect_v2_chunk_entries(
52    data: &[u8],
53    btree_address: u64,
54    offset_size: u8,
55    length_size: u8,
56    ndim: u32,
57    chunk_dims: &[u32],
58    chunk_bounds: Option<(&[u64], &[u64])>,
59) -> Result<Vec<ChunkEntry>> {
60    let mut cursor = crate::io::Cursor::new(data);
61    cursor.set_position(btree_address);
62    let header = crate::btree_v2::BTreeV2Header::parse(&mut cursor, offset_size, length_size)?;
63
64    let records = crate::btree_v2::collect_btree_v2_records(
65        data,
66        &header,
67        offset_size,
68        length_size,
69        Some(ndim),
70        chunk_dims,
71        chunk_bounds,
72    )?;
73
74    let mut entries = Vec::with_capacity(records.len());
75    for record in records {
76        match record {
77            crate::btree_v2::BTreeV2Record::ChunkedNonFiltered { address, offsets } => {
78                if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) {
79                    entries.push(ChunkEntry {
80                        address,
81                        size: 0, // caller must compute from chunk dims * elem_size
82                        filter_mask: 0,
83                        offsets,
84                    });
85                }
86            }
87            crate::btree_v2::BTreeV2Record::ChunkedFiltered {
88                address,
89                chunk_size,
90                filter_mask,
91                offsets,
92            } => {
93                if chunk_overlaps_bounds(&offsets, chunk_dims, chunk_bounds) {
94                    entries.push(ChunkEntry {
95                        address,
96                        size: chunk_size,
97                        filter_mask,
98                        offsets,
99                    });
100                }
101            }
102            _ => {
103                // Skip non-chunk records
104            }
105        }
106    }
107
108    Ok(entries)
109}
110
111/// Collect chunk entries for implicit indexing.
112///
113/// Implicit chunks are laid out sequentially starting at the given address.
114/// Each chunk has the same size = product(chunk_dims) * elem_size.
115pub fn collect_implicit_chunk_entries(
116    start_address: u64,
117    dataset_shape: &[u64],
118    chunk_dims: &[u32],
119    elem_size: usize,
120    chunk_bounds: Option<(&[u64], &[u64])>,
121) -> Vec<ChunkEntry> {
122    let chunk_bytes: u64 = chunk_dims.iter().map(|&d| d as u64).product::<u64>() * elem_size as u64;
123    let ndim = dataset_shape.len();
124
125    // Compute how many chunks along each dimension
126    let chunks_per_dim: Vec<u64> = (0..ndim)
127        .map(|i| dataset_shape[i].div_ceil(chunk_dims[i] as u64))
128        .collect();
129
130    if ndim == 0 {
131        return vec![ChunkEntry {
132            address: start_address,
133            size: chunk_bytes,
134            filter_mask: 0,
135            offsets: Vec::new(),
136        }];
137    }
138
139    let (first_chunk, last_chunk): (Vec<u64>, Vec<u64>) = match chunk_bounds {
140        Some((first, last)) => (first.to_vec(), last.to_vec()),
141        None => (
142            vec![0u64; ndim],
143            chunks_per_dim
144                .iter()
145                .map(|count| count.saturating_sub(1))
146                .collect(),
147        ),
148    };
149
150    let mut chunk_counts = Vec::with_capacity(ndim);
151    for dim in 0..ndim {
152        chunk_counts.push(last_chunk[dim] - first_chunk[dim] + 1);
153    }
154    let total_selected_chunks: u64 = chunk_counts.iter().product();
155    let mut entries = Vec::with_capacity(total_selected_chunks as usize);
156    let mut chunk_indices = first_chunk.clone();
157
158    loop {
159        let chunk_idx = chunk_linear_index(&chunk_indices, &chunks_per_dim);
160        let offsets = chunk_indices
161            .iter()
162            .enumerate()
163            .map(|(dim, chunk_index)| chunk_index * u64::from(chunk_dims[dim]))
164            .collect();
165
166        entries.push(ChunkEntry {
167            address: start_address + chunk_idx * chunk_bytes,
168            size: chunk_bytes,
169            filter_mask: 0,
170            offsets,
171        });
172
173        let mut advanced = false;
174        for dim in (0..ndim).rev() {
175            if chunk_indices[dim] < last_chunk[dim] {
176                chunk_indices[dim] += 1;
177                if dim + 1 < ndim {
178                    chunk_indices[(dim + 1)..ndim].copy_from_slice(&first_chunk[(dim + 1)..ndim]);
179                }
180                advanced = true;
181                break;
182            }
183        }
184
185        if !advanced {
186            break;
187        }
188    }
189
190    entries
191}
192
193/// Resolve a single-chunk layout.
194///
195/// The entire dataset is stored as one chunk at the given address.
196pub fn single_chunk_entry(
197    address: u64,
198    filtered_size: u64,
199    filter_mask: u32,
200    ndim: usize,
201) -> ChunkEntry {
202    ChunkEntry {
203        address,
204        size: filtered_size,
205        filter_mask,
206        offsets: vec![0u64; ndim],
207    }
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// ChunkEntry derives Debug and Clone; sanity-check both.
    #[test]
    fn test_chunk_entry_debug_clone() {
        let entry = ChunkEntry {
            address: 0x1000,
            size: 4096,
            filter_mask: 0,
            offsets: vec![0, 0],
        };
        let entry2 = entry.clone();
        assert_eq!(entry2.address, 0x1000);
        let _ = format!("{:?}", entry);
    }

    /// Implicit chunks are enumerated in row-major order with sequential
    /// addresses spaced by the fixed chunk byte size.
    #[test]
    fn test_implicit_chunk_entries() {
        let entries = collect_implicit_chunk_entries(1000, &[10, 20], &[5, 10], 4, None);
        // 2 chunks along dim 0, 2 chunks along dim 1 = 4 total
        assert_eq!(entries.len(), 4);
        assert_eq!(entries[0].address, 1000);
        assert_eq!(entries[0].offsets, vec![0, 0]);
        assert_eq!(entries[1].address, 1000 + 200); // 5*10*4 = 200
        assert_eq!(entries[1].offsets, vec![0, 10]);
        assert_eq!(entries[2].offsets, vec![5, 0]);
        assert_eq!(entries[3].offsets, vec![5, 10]);
    }

    /// Single-chunk layout yields one entry with all-zero offsets.
    #[test]
    fn test_single_chunk_entry() {
        let entry = single_chunk_entry(0x2000, 8192, 0, 3);
        assert_eq!(entry.address, 0x2000);
        assert_eq!(entry.size, 8192);
        assert_eq!(entry.offsets, vec![0, 0, 0]);
    }
}