Skip to main content

rustyhdf5_format/
fixed_array.rs

1//! HDF5 Fixed Array index parsing for chunked datasets (v4 index type 3).
2
3#[cfg(not(feature = "std"))]
4extern crate alloc;
5
6#[cfg(not(feature = "std"))]
7use alloc::{format, vec, vec::Vec};
8
9use crate::chunked_read::ChunkInfo;
10use crate::error::FormatError;
11
/// Parsed Fixed Array header (FAHD), the fixed-size chunk index used by
/// HDF5 version-4 data layouts (index type 3).
#[derive(Debug, Clone)]
pub struct FixedArrayHeader {
    /// Client ID: 0 = non-filtered chunks, 1 = filtered chunks.
    pub client_id: u8,
    /// Size in bytes of each element stored in the array's data block.
    pub element_size: u8,
    /// Log2 of the maximum number of elements per data block page.
    pub max_nelmts_bits: u8,
    /// Total number of elements (chunks) tracked by the array.
    pub num_elements: u64,
    /// File address of the Fixed Array data block (FADB).
    pub data_block_address: u64,
}
26
27fn read_offset(data: &[u8], pos: usize, size: u8) -> Result<u64, FormatError> {
28    let s = size as usize;
29    if pos + s > data.len() {
30        return Err(FormatError::UnexpectedEof {
31            expected: pos + s,
32            available: data.len(),
33        });
34    }
35    let slice = &data[pos..pos + s];
36    Ok(match size {
37        2 => u16::from_le_bytes([slice[0], slice[1]]) as u64,
38        4 => u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]) as u64,
39        8 => u64::from_le_bytes([
40            slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
41        ]),
42        _ => return Err(FormatError::InvalidOffsetSize(size)),
43    })
44}
45
46fn read_length(data: &[u8], pos: usize, size: u8) -> Result<u64, FormatError> {
47    read_offset(data, pos, size)
48}
49
/// Report whether the `size`-byte field at `pos` holds the HDF5 "undefined
/// address" sentinel (every byte 0xFF). An out-of-range window yields `false`.
fn is_undefined(data: &[u8], pos: usize, size: u8) -> bool {
    match data.get(pos..pos + size as usize) {
        Some(window) => window.iter().all(|&b| b == 0xFF),
        None => false,
    }
}
57
58impl FixedArrayHeader {
59    /// Parse a Fixed Array header from file data at the given offset.
60    pub fn parse(
61        file_data: &[u8],
62        offset: usize,
63        offset_size: u8,
64        length_size: u8,
65    ) -> Result<Self, FormatError> {
66        // FAHD signature(4) + version(1) + client_id(1) + element_size(1) +
67        // max_nelmts_bits(1) + num_elements(length_size) + data_block_addr(offset_size) + checksum(4)
68        let min_size = 4 + 1 + 1 + 1 + 1 + length_size as usize + offset_size as usize + 4;
69        if offset + min_size > file_data.len() {
70            return Err(FormatError::UnexpectedEof {
71                expected: offset + min_size,
72                available: file_data.len(),
73            });
74        }
75
76        let d = &file_data[offset..];
77        if &d[0..4] != b"FAHD" {
78            return Err(FormatError::ChunkedReadError(
79                "invalid Fixed Array header signature".into(),
80            ));
81        }
82
83        let version = d[4];
84        if version != 0 {
85            return Err(FormatError::ChunkedReadError(
86                format!("unsupported Fixed Array header version: {version}"),
87            ));
88        }
89
90        let client_id = d[5];
91        let element_size = d[6];
92        let max_nelmts_bits = d[7];
93
94        let mut pos = 8;
95        let num_elements = read_length(d, pos, length_size)?;
96        pos += length_size as usize;
97        let data_block_address = read_offset(d, pos, offset_size)?;
98
99        Ok(FixedArrayHeader {
100            client_id,
101            element_size,
102            max_nelmts_bits,
103            num_elements,
104            data_block_address,
105        })
106    }
107}
108
109/// Read chunk records from a Fixed Array data block.
110///
111/// Returns a `Vec<ChunkInfo>` with one entry per allocated chunk.
112/// `chunk_dimensions` should be the spatial chunk dims only (not including the element-size dim).
113/// `element_size` is the datatype size in bytes.
114#[allow(clippy::too_many_arguments)]
115pub fn read_fixed_array_chunks(
116    file_data: &[u8],
117    header: &FixedArrayHeader,
118    dataset_dims: &[u64],
119    chunk_dimensions: &[u32],
120    element_size: u32,
121    offset_size: u8,
122    _length_size: u8,
123) -> Result<Vec<ChunkInfo>, FormatError> {
124    let db_offset = header.data_block_address as usize;
125    let rank = chunk_dimensions.len();
126
127    // Parse data block header: FADB(4) + version(1) + client_id(1) + header_address(offset_size)
128    let db_header_size = 4 + 1 + 1 + offset_size as usize;
129    if db_offset + db_header_size > file_data.len() {
130        return Err(FormatError::UnexpectedEof {
131            expected: db_offset + db_header_size,
132            available: file_data.len(),
133        });
134    }
135
136    let d = &file_data[db_offset..];
137    if &d[0..4] != b"FADB" {
138        return Err(FormatError::ChunkedReadError(
139            "invalid Fixed Array data block signature".into(),
140        ));
141    }
142
143    // Skip version(1) + client_id(1) + header_address(offset_size)
144    let mut pos = db_header_size;
145
146    // Check if paged
147    let page_size = 1u64 << header.max_nelmts_bits;
148    let is_paged = header.num_elements > page_size;
149
150    if is_paged {
151        // For paged data blocks, we need to handle page bitmap + pages
152        // For now, implement non-paged path (covers most real-world cases)
153        return Err(FormatError::ChunkedReadError(
154            "paged Fixed Array data blocks not yet supported".into(),
155        ));
156    }
157
158    // Non-paged: elements stored directly
159    let num_elements = header.num_elements as usize;
160    let os = offset_size as usize;
161
162    // Compute chunk offsets based on index
163    // Chunks are stored in row-major order within the dataset space
164    let mut num_chunks_per_dim = Vec::with_capacity(rank);
165    for d_idx in 0..rank {
166        let ds_dim = dataset_dims[d_idx];
167        let ch_dim = chunk_dimensions[d_idx] as u64;
168        num_chunks_per_dim.push(ds_dim.div_ceil(ch_dim));
169    }
170
171    let chunk_byte_size: u64 = chunk_dimensions.iter().map(|&d| d as u64).product::<u64>()
172        * element_size as u64;
173
174    let mut chunks = Vec::new();
175
176    for i in 0..num_elements {
177        let elem_data = &file_data[db_offset + pos..];
178        if header.client_id == 0 {
179            // Non-filtered: just address
180            if pos + os > file_data.len() - db_offset {
181                return Err(FormatError::UnexpectedEof {
182                    expected: db_offset + pos + os,
183                    available: file_data.len(),
184                });
185            }
186            let address = read_offset(elem_data, 0, offset_size)?;
187            pos += os;
188
189            if is_undefined(file_data, db_offset + pos - os, offset_size) {
190                continue; // unallocated chunk
191            }
192
193            let offsets = index_to_chunk_offsets(i, &num_chunks_per_dim, chunk_dimensions);
194            chunks.push(ChunkInfo {
195                chunk_size: chunk_byte_size as u32,
196                filter_mask: 0,
197                offsets,
198                address,
199            });
200        } else {
201            // Filtered: address(offset_size) + chunk_size(variable) + filter_mask(4)
202            let chunk_size_bytes = header.element_size as usize - os - 4;
203            let elem_total = os + chunk_size_bytes + 4;
204            if pos + elem_total > file_data.len() - db_offset {
205                return Err(FormatError::UnexpectedEof {
206                    expected: db_offset + pos + elem_total,
207                    available: file_data.len(),
208                });
209            }
210
211            let address = read_offset(elem_data, 0, offset_size)?;
212
213            // Read chunk_size (variable length, little-endian)
214            let chunk_size = read_variable_length(&elem_data[os..], chunk_size_bytes)?;
215
216            let fm_off = os + chunk_size_bytes;
217            let filter_mask = u32::from_le_bytes([
218                elem_data[fm_off],
219                elem_data[fm_off + 1],
220                elem_data[fm_off + 2],
221                elem_data[fm_off + 3],
222            ]);
223            pos += elem_total;
224
225            if is_undefined(file_data, db_offset + pos - elem_total, offset_size) {
226                continue; // unallocated chunk
227            }
228
229            let offsets = index_to_chunk_offsets(i, &num_chunks_per_dim, chunk_dimensions);
230            chunks.push(ChunkInfo {
231                chunk_size: chunk_size as u32,
232                filter_mask,
233                offsets,
234                address,
235            });
236        }
237    }
238
239    Ok(chunks)
240}
241
/// Map a linear chunk index (row-major, slowest dimension first) to the
/// chunk's starting offsets in dataset element space.
fn index_to_chunk_offsets(
    index: usize,
    num_chunks_per_dim: &[u64],
    chunk_dimensions: &[u32],
) -> Vec<u64> {
    let mut offsets = vec![0u64; num_chunks_per_dim.len()];
    let mut linear = index as u64;
    // Peel dimensions from fastest-varying (last) to slowest (first).
    for (slot, (&nchunks, &chunk_len)) in offsets
        .iter_mut()
        .zip(num_chunks_per_dim.iter().zip(chunk_dimensions.iter()))
        .rev()
    {
        *slot = (linear % nchunks) * chunk_len as u64;
        linear /= nchunks;
    }
    offsets
}
259
260/// Read a variable-length little-endian unsigned integer.
261fn read_variable_length(data: &[u8], size: usize) -> Result<u64, FormatError> {
262    if size > 8 || data.len() < size {
263        return Err(FormatError::ChunkedReadError(
264            "invalid variable-length size".into(),
265        ));
266    }
267    let mut val = 0u64;
268    for (i, &byte) in data.iter().enumerate().take(size) {
269        val |= (byte as u64) << (i * 8);
270    }
271    Ok(val)
272}
273
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn index_to_offsets_1d() {
        let num_chunks = vec![5u64];
        let chunk_dims = vec![20u32];
        assert_eq!(index_to_chunk_offsets(0, &num_chunks, &chunk_dims), vec![0]);
        assert_eq!(index_to_chunk_offsets(1, &num_chunks, &chunk_dims), vec![20]);
        assert_eq!(index_to_chunk_offsets(4, &num_chunks, &chunk_dims), vec![80]);
    }

    #[test]
    fn index_to_offsets_2d() {
        // 10x6 dataset with 4x3 chunks => ceil(10/4)=3, ceil(6/3)=2 => 6 chunks total.
        let num_chunks = vec![3u64, 2];
        let chunk_dims = vec![4u32, 3];
        assert_eq!(index_to_chunk_offsets(0, &num_chunks, &chunk_dims), vec![0, 0]);
        assert_eq!(index_to_chunk_offsets(1, &num_chunks, &chunk_dims), vec![0, 3]);
        assert_eq!(index_to_chunk_offsets(2, &num_chunks, &chunk_dims), vec![4, 0]);
        assert_eq!(index_to_chunk_offsets(3, &num_chunks, &chunk_dims), vec![4, 3]);
        assert_eq!(index_to_chunk_offsets(5, &num_chunks, &chunk_dims), vec![8, 3]);
    }

    #[test]
    fn read_variable_length_values() {
        assert_eq!(read_variable_length(&[0x78, 0x56], 2).unwrap(), 0x5678);
        assert_eq!(read_variable_length(&[0x01, 0x02, 0x03, 0x04], 4).unwrap(), 0x04030201);
        assert_eq!(read_variable_length(&[0xFF], 1).unwrap(), 0xFF);
    }

    #[test]
    fn parse_fixed_array_header_valid() {
        let mut buf = vec![0u8; 256];
        buf[0..4].copy_from_slice(b"FAHD");
        buf[4] = 0; // version
        buf[5] = 1; // client_id = filtered
        buf[6] = 16; // element_size
        buf[7] = 10; // max_nelmts_bits (page_size = 1024)
        // num_elements (length_size=8)
        buf[8..16].copy_from_slice(&5u64.to_le_bytes());
        // data_block_address (offset_size=8)
        buf[16..24].copy_from_slice(&0x1000u64.to_le_bytes());
        // trailing 4-byte checksum is left zeroed; parse does not validate it

        let header = FixedArrayHeader::parse(&buf, 0, 8, 8).unwrap();
        assert_eq!(header.client_id, 1);
        assert_eq!(header.element_size, 16);
        assert_eq!(header.max_nelmts_bits, 10);
        assert_eq!(header.num_elements, 5);
        assert_eq!(header.data_block_address, 0x1000);
    }

    #[test]
    fn parse_fixed_array_header_invalid_signature() {
        let mut buf = vec![0u8; 256];
        buf[0..4].copy_from_slice(b"XXXX");
        assert!(FixedArrayHeader::parse(&buf, 0, 8, 8).is_err());
    }

    #[test]
    fn parse_fixed_array_header_invalid_version() {
        let mut buf = vec![0u8; 256];
        buf[0..4].copy_from_slice(b"FAHD");
        buf[4] = 1; // unsupported version
        assert!(FixedArrayHeader::parse(&buf, 0, 8, 8).is_err());
    }

    /// Build a synthetic non-filtered Fixed Array and verify chunk reading.
    #[test]
    fn read_non_filtered_chunks() {
        let offset_size: u8 = 8;
        let length_size: u8 = 8;
        let os = offset_size as usize;
        let num_chunks = 5u64;

        let mut file_data = vec![0u8; 0x3000];

        // FAHD header at 0x100.
        let fahd_offset = 0x100usize;
        let db_offset = 0x200usize;
        file_data[fahd_offset..fahd_offset + 4].copy_from_slice(b"FAHD");
        file_data[fahd_offset + 4] = 0; // version
        file_data[fahd_offset + 5] = 0; // client_id = non-filtered
        file_data[fahd_offset + 6] = os as u8; // element_size = address only
        file_data[fahd_offset + 7] = 10; // max_nelmts_bits
        file_data[fahd_offset + 8..fahd_offset + 16].copy_from_slice(&num_chunks.to_le_bytes());
        file_data[fahd_offset + 16..fahd_offset + 24]
            .copy_from_slice(&(db_offset as u64).to_le_bytes());

        // FADB data block at db_offset.
        file_data[db_offset..db_offset + 4].copy_from_slice(b"FADB");
        file_data[db_offset + 4] = 0; // version
        file_data[db_offset + 5] = 0; // client_id
        file_data[db_offset + 6..db_offset + 14]
            .copy_from_slice(&(fahd_offset as u64).to_le_bytes()); // header_address

        // Five chunk addresses, evenly spaced.
        let elem_start = db_offset + 6 + os;
        let base_addr = 0x1000u64;
        let chunk_byte_size = 20 * 8; // 20 elements x 8 bytes
        for i in 0..5 {
            let addr = base_addr + i as u64 * chunk_byte_size as u64;
            let pos = elem_start + i * os;
            file_data[pos..pos + os].copy_from_slice(&addr.to_le_bytes());
        }

        let header = FixedArrayHeader::parse(&file_data, fahd_offset, offset_size, length_size)
            .unwrap();
        let ds_dims = vec![100u64];
        let chunk_dims = vec![20u32];
        let chunks = read_fixed_array_chunks(
            &file_data, &header, &ds_dims, &chunk_dims, 8, offset_size, length_size,
        ).unwrap();

        assert_eq!(chunks.len(), 5);
        for (i, c) in chunks.iter().enumerate() {
            assert_eq!(c.address, base_addr + i as u64 * chunk_byte_size as u64);
            assert_eq!(c.offsets, vec![i as u64 * 20]);
            assert_eq!(c.filter_mask, 0);
            assert_eq!(c.chunk_size, chunk_byte_size as u32);
        }
    }

    /// Build a synthetic filtered Fixed Array and verify chunk reading.
    #[test]
    fn read_filtered_chunks() {
        let offset_size: u8 = 8;
        let length_size: u8 = 8;
        let os = offset_size as usize;
        let num_chunks = 3u64;
        // Filtered element layout: address(os) + chunk_size(4 bytes here) + filter_mask(4).
        let chunk_size_bytes = 4usize;
        let elem_size = os + chunk_size_bytes + 4;

        let mut file_data = vec![0u8; 0x3000];

        let fahd_offset = 0x100usize;
        let db_offset = 0x200usize;
        file_data[fahd_offset..fahd_offset + 4].copy_from_slice(b"FAHD");
        file_data[fahd_offset + 4] = 0;
        file_data[fahd_offset + 5] = 1; // client_id = filtered
        file_data[fahd_offset + 6] = elem_size as u8;
        file_data[fahd_offset + 7] = 10;
        file_data[fahd_offset + 8..fahd_offset + 16].copy_from_slice(&num_chunks.to_le_bytes());
        file_data[fahd_offset + 16..fahd_offset + 24]
            .copy_from_slice(&(db_offset as u64).to_le_bytes());

        file_data[db_offset..db_offset + 4].copy_from_slice(b"FADB");
        file_data[db_offset + 4] = 0;
        file_data[db_offset + 5] = 1;
        file_data[db_offset + 6..db_offset + 14]
            .copy_from_slice(&(fahd_offset as u64).to_le_bytes());

        let elem_start = db_offset + 6 + os;
        let test_chunks = [
            (0x1000u64, 120u32, 0u32),
            (0x2000u64, 115u32, 0u32),
            (0x3000u64, 100u32, 0u32),
        ];

        for (i, &(addr, csize, fmask)) in test_chunks.iter().enumerate() {
            let pos = elem_start + i * elem_size;
            file_data[pos..pos + os].copy_from_slice(&addr.to_le_bytes());
            // chunk_size stored as 4-byte LE, then the 4-byte filter mask.
            file_data[pos + os..pos + os + 4].copy_from_slice(&csize.to_le_bytes());
            file_data[pos + os + 4..pos + os + 8].copy_from_slice(&fmask.to_le_bytes());
        }

        let header = FixedArrayHeader::parse(&file_data, fahd_offset, offset_size, length_size)
            .unwrap();
        let ds_dims = vec![60u64];
        let chunk_dims = vec![20u32];
        let chunks = read_fixed_array_chunks(
            &file_data, &header, &ds_dims, &chunk_dims, 8, offset_size, length_size,
        ).unwrap();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].address, 0x1000);
        assert_eq!(chunks[0].chunk_size, 120);
        assert_eq!(chunks[0].filter_mask, 0);
        assert_eq!(chunks[0].offsets, vec![0]);
        assert_eq!(chunks[1].address, 0x2000);
        assert_eq!(chunks[1].chunk_size, 115);
        assert_eq!(chunks[2].address, 0x3000);
        assert_eq!(chunks[2].chunk_size, 100);
    }
}