Skip to main content

hdf5_pure/
reader.rs

1//! Reading API: File, Dataset, and Group handles for reading HDF5 files.
2
3use std::collections::HashMap;
4
5use crate::attribute::extract_attributes_full;
6use crate::chunk_cache::ChunkCache;
7use crate::data_layout::DataLayout;
8use crate::data_read;
9use crate::dataspace::Dataspace;
10use crate::datatype::Datatype;
11use crate::error::{Error, FormatError};
12use crate::filter_pipeline::FilterPipeline;
13use crate::group_v1::GroupEntry;
14use crate::group_v2;
15use crate::message_type::MessageType;
16use crate::object_header::ObjectHeader;
17use crate::signature;
18use crate::superblock::Superblock;
19
20use crate::types::{attrs_to_map, classify_datatype, AttrValue, DType};
21
22// ---------------------------------------------------------------------------
23// File
24// ---------------------------------------------------------------------------
25
26/// An open HDF5 file for reading.
27pub struct File {
28    data: Vec<u8>,
29    superblock: Superblock,
30    chunk_cache: ChunkCache,
31    /// Byte offset to add to all relative addresses (= original base_address).
32    addr_offset: u64,
33}
34
35impl File {
36    /// Open an HDF5 file from a filesystem path.
37    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self, Error> {
38        let bytes = std::fs::read(path.as_ref()).map_err(Error::Io)?;
39        Self::from_bytes(bytes)
40    }
41
42    /// Open an HDF5 file from an in-memory byte vector.
43    pub fn from_bytes(data: Vec<u8>) -> Result<Self, Error> {
44        let sig_offset = signature::find_signature(&data)?;
45        let mut superblock = Superblock::parse(&data, sig_offset)?;
46        let addr_offset = superblock.base_address;
47        // Normalize root_group_address to absolute so resolve_path_any works.
48        superblock.root_group_address += addr_offset;
49        Ok(Self {
50            data,
51            superblock,
52            chunk_cache: ChunkCache::new(),
53            addr_offset,
54        })
55    }
56
57    /// Returns a handle to the root group.
58    pub fn root(&self) -> Group<'_> {
59        Group {
60            file: self,
61            // root_group_address was normalized to absolute in from_bytes()
62            address: self.superblock.root_group_address,
63        }
64    }
65
66    /// Resolve a path and return a `Dataset` handle.
67    pub fn dataset(&self, path: &str) -> Result<Dataset<'_>, Error> {
68        let addr = group_v2::resolve_path_any(&self.data, &self.superblock, path)?;
69        let hdr = self.parse_header(addr)?;
70        if !has_message(&hdr, MessageType::DataLayout) {
71            return Err(Error::NotADataset(path.to_string()));
72        }
73        Ok(Dataset {
74            file: self,
75            header: hdr,
76        })
77    }
78
79    /// Resolve a path and return a `Group` handle.
80    pub fn group(&self, path: &str) -> Result<Group<'_>, Error> {
81        let addr = group_v2::resolve_path_any(&self.data, &self.superblock, path)?;
82        Ok(Group {
83            file: self,
84            address: addr,
85        })
86    }
87
88    /// Returns the raw file bytes.
89    pub fn as_bytes(&self) -> &[u8] {
90        &self.data
91    }
92
93    /// Returns a reference to the parsed superblock.
94    pub fn superblock(&self) -> &Superblock {
95        &self.superblock
96    }
97
98    fn parse_header(&self, address: u64) -> Result<ObjectHeader, FormatError> {
99        ObjectHeader::parse_with_base(
100            &self.data,
101            address as usize,
102            self.superblock.offset_size,
103            self.superblock.length_size,
104            self.addr_offset,
105        )
106    }
107
108    fn offset_size(&self) -> u8 {
109        self.superblock.offset_size
110    }
111
112    fn length_size(&self) -> u8 {
113        self.superblock.length_size
114    }
115}
116
117impl std::fmt::Debug for File {
118    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
119        f.debug_struct("File")
120            .field("size", &self.data.len())
121            .field("superblock_version", &self.superblock.version)
122            .finish()
123    }
124}
125
126// ---------------------------------------------------------------------------
127// Group handle
128// ---------------------------------------------------------------------------
129
130/// A lightweight handle to an HDF5 group.
131pub struct Group<'f> {
132    file: &'f File,
133    address: u64,
134}
135
136impl<'f> Group<'f> {
137    /// List the names of datasets in this group.
138    pub fn datasets(&self) -> Result<Vec<String>, Error> {
139        let entries = self.children()?;
140        let mut names = Vec::new();
141        for entry in &entries {
142            let hdr = self.file.parse_header(entry.object_header_address)?;
143            if has_message(&hdr, MessageType::DataLayout) {
144                names.push(entry.name.clone());
145            }
146        }
147        Ok(names)
148    }
149
150    /// List the names of subgroups in this group.
151    pub fn groups(&self) -> Result<Vec<String>, Error> {
152        let entries = self.children()?;
153        let mut names = Vec::new();
154        for entry in &entries {
155            let hdr = self.file.parse_header(entry.object_header_address)?;
156            if is_group(&hdr) {
157                names.push(entry.name.clone());
158            }
159        }
160        Ok(names)
161    }
162
163    /// Read all attributes of this group.
164    pub fn attrs(&self) -> Result<HashMap<String, AttrValue>, Error> {
165        let hdr = self.file.parse_header(self.address)?;
166        let attr_msgs = extract_attributes_full(
167            &self.file.data,
168            &hdr,
169            self.file.offset_size(),
170            self.file.length_size(),
171        )?;
172        Ok(attrs_to_map(
173            &attr_msgs,
174            &self.file.data,
175            self.file.offset_size(),
176            self.file.length_size(),
177        ))
178    }
179
180    /// Get a dataset within this group by name.
181    pub fn dataset(&self, name: &str) -> Result<Dataset<'f>, Error> {
182        let entries = self.children()?;
183        let entry = entries
184            .iter()
185            .find(|e| e.name == name)
186            .ok_or_else(|| Error::Format(FormatError::PathNotFound(name.to_string())))?;
187        let hdr = self.file.parse_header(entry.object_header_address)?;
188        if !has_message(&hdr, MessageType::DataLayout) {
189            return Err(Error::NotADataset(name.to_string()));
190        }
191        Ok(Dataset {
192            file: self.file,
193            header: hdr,
194        })
195    }
196
197    /// Get a subgroup within this group by name.
198    pub fn group(&self, name: &str) -> Result<Group<'f>, Error> {
199        let entries = self.children()?;
200        let entry = entries
201            .iter()
202            .find(|e| e.name == name)
203            .ok_or_else(|| Error::Format(FormatError::PathNotFound(name.to_string())))?;
204        Ok(Group {
205            file: self.file,
206            address: entry.object_header_address,
207        })
208    }
209
210    fn children(&self) -> Result<Vec<GroupEntry>, Error> {
211        let hdr = self.file.parse_header(self.address)?;
212        let os = self.file.offset_size();
213        let ls = self.file.length_size();
214        let base = self.file.addr_offset;
215        let mut entries = group_v2::resolve_group_entries(&self.file.data, &hdr, os, ls, base)
216            .map_err(Error::Format)?;
217        // Convert link addresses from relative to absolute
218        for entry in &mut entries {
219            entry.object_header_address += base;
220        }
221        Ok(entries)
222    }
223}
224
225// ---------------------------------------------------------------------------
226// Dataset handle
227// ---------------------------------------------------------------------------
228
229/// A lightweight handle to an HDF5 dataset.
230#[derive(Debug)]
231pub struct Dataset<'f> {
232    file: &'f File,
233    header: ObjectHeader,
234}
235
236impl<'f> Dataset<'f> {
237    /// Returns the shape (dimensions) of the dataset.
238    pub fn shape(&self) -> Result<Vec<u64>, Error> {
239        let ds = self.dataspace()?;
240        Ok(ds.dimensions.clone())
241    }
242
243    /// Returns the simplified datatype of the dataset.
244    pub fn dtype(&self) -> Result<DType, Error> {
245        let dt = self.datatype()?;
246        Ok(classify_datatype(&dt))
247    }
248
249    /// Read all data as `f64` values.
250    pub fn read_f64(&self) -> Result<Vec<f64>, Error> {
251        let raw = self.read_raw()?;
252        let dt = self.datatype()?;
253        Ok(data_read::read_as_f64(&raw, &dt)?)
254    }
255
256    /// Read all data as `f32` values.
257    pub fn read_f32(&self) -> Result<Vec<f32>, Error> {
258        let raw = self.read_raw()?;
259        let dt = self.datatype()?;
260        Ok(data_read::read_as_f32(&raw, &dt)?)
261    }
262
263    /// Read all data as `i32` values.
264    pub fn read_i32(&self) -> Result<Vec<i32>, Error> {
265        let raw = self.read_raw()?;
266        let dt = self.datatype()?;
267        Ok(data_read::read_as_i32(&raw, &dt)?)
268    }
269
270    /// Read all data as `i64` values.
271    pub fn read_i64(&self) -> Result<Vec<i64>, Error> {
272        let raw = self.read_raw()?;
273        let dt = self.datatype()?;
274        Ok(data_read::read_as_i64(&raw, &dt)?)
275    }
276
277    /// Read all data as `u64` values.
278    pub fn read_u64(&self) -> Result<Vec<u64>, Error> {
279        let raw = self.read_raw()?;
280        let dt = self.datatype()?;
281        Ok(data_read::read_as_u64(&raw, &dt)?)
282    }
283
284    /// Read all data as `u8` values.
285    pub fn read_u8(&self) -> Result<Vec<u8>, Error> {
286        self.read_raw()
287    }
288
289    /// Read all data as `i8` values.
290    pub fn read_i8(&self) -> Result<Vec<i8>, Error> {
291        let raw = self.read_raw()?;
292        Ok(raw.iter().map(|&b| b as i8).collect())
293    }
294
295    /// Read all data as `i16` values.
296    pub fn read_i16(&self) -> Result<Vec<i16>, Error> {
297        let raw = self.read_raw()?;
298        let dt = self.datatype()?;
299        let vals = data_read::read_as_i32(&raw, &dt)?;
300        Ok(vals.into_iter().map(|v| v as i16).collect())
301    }
302
303    /// Read all data as `u16` values.
304    pub fn read_u16(&self) -> Result<Vec<u16>, Error> {
305        let raw = self.read_raw()?;
306        let dt = self.datatype()?;
307        let vals = data_read::read_as_u64(&raw, &dt)?;
308        Ok(vals.into_iter().map(|v| v as u16).collect())
309    }
310
311    /// Read all data as `u32` values.
312    pub fn read_u32(&self) -> Result<Vec<u32>, Error> {
313        let raw = self.read_raw()?;
314        let dt = self.datatype()?;
315        let vals = data_read::read_as_u64(&raw, &dt)?;
316        Ok(vals.into_iter().map(|v| v as u32).collect())
317    }
318
319    /// Read all data as `String` values.
320    pub fn read_string(&self) -> Result<Vec<String>, Error> {
321        let raw = self.read_raw()?;
322        let dt = self.datatype()?;
323        Ok(data_read::read_as_strings(&raw, &dt)?)
324    }
325
326    /// Read all attributes of this dataset.
327    pub fn attrs(&self) -> Result<HashMap<String, AttrValue>, Error> {
328        let attr_msgs = extract_attributes_full(
329            &self.file.data,
330            &self.header,
331            self.file.offset_size(),
332            self.file.length_size(),
333        )?;
334        Ok(attrs_to_map(
335            &attr_msgs,
336            &self.file.data,
337            self.file.offset_size(),
338            self.file.length_size(),
339        ))
340    }
341
342    fn datatype(&self) -> Result<Datatype, Error> {
343        let msg = find_message(&self.header, MessageType::Datatype)?;
344        let (dt, _) = Datatype::parse(&msg.data)?;
345        Ok(dt)
346    }
347
348    fn dataspace(&self) -> Result<Dataspace, Error> {
349        let msg = find_message(&self.header, MessageType::Dataspace)?;
350        Ok(Dataspace::parse(&msg.data, self.file.length_size())?)
351    }
352
353    fn data_layout(&self) -> Result<DataLayout, Error> {
354        let msg = find_message(&self.header, MessageType::DataLayout)?;
355        Ok(DataLayout::parse(
356            &msg.data,
357            self.file.offset_size(),
358            self.file.length_size(),
359        )?)
360    }
361
362    fn filter_pipeline(&self) -> Option<FilterPipeline> {
363        self.header
364            .messages
365            .iter()
366            .find(|m| m.msg_type == MessageType::FilterPipeline)
367            .and_then(|msg| FilterPipeline::parse(&msg.data).ok())
368    }
369
370    fn read_raw(&self) -> Result<Vec<u8>, Error> {
371        let dt = self.datatype()?;
372        let ds = self.dataspace()?;
373        let mut dl = self.data_layout()?;
374        // Adjust contiguous data address by base_address offset
375        if self.file.addr_offset != 0 {
376            if let DataLayout::Contiguous { ref mut address, .. } = dl {
377                if let Some(addr) = address {
378                    *addr += self.file.addr_offset;
379                }
380            }
381        }
382        let pipeline = self.filter_pipeline();
383        Ok(data_read::read_raw_data_cached(
384            &self.file.data,
385            &dl,
386            &ds,
387            &dt,
388            pipeline.as_ref(),
389            self.file.offset_size(),
390            self.file.length_size(),
391            &self.file.chunk_cache,
392        )?)
393    }
394}
395
396// ---------------------------------------------------------------------------
397// Helpers
398// ---------------------------------------------------------------------------
399
400fn find_message(
401    header: &ObjectHeader,
402    msg_type: MessageType,
403) -> Result<&crate::object_header::HeaderMessage, Error> {
404    header
405        .messages
406        .iter()
407        .find(|m| m.msg_type == msg_type)
408        .ok_or(Error::MissingMessage(msg_type))
409}
410
411fn has_message(header: &ObjectHeader, msg_type: MessageType) -> bool {
412    header.messages.iter().any(|m| m.msg_type == msg_type)
413}
414
415fn is_group(header: &ObjectHeader) -> bool {
416    header
417        .messages
418        .iter()
419        .any(|m| m.msg_type == MessageType::LinkInfo
420            || m.msg_type == MessageType::Link
421            || m.msg_type == MessageType::SymbolTable)
422}
423