Skip to main content

async_hdf5/messages/
attribute.rs

1use std::sync::Arc;
2
3use bytes::Bytes;
4
5use crate::endian::HDF5Reader;
6use crate::error::Result;
7use crate::heap;
8use crate::messages::dataspace::DataspaceMessage;
9use crate::messages::datatype::{ByteOrder, DataType, StringPadding};
10use crate::reader::AsyncFileReader;
11
/// The typed payload of an HDF5 attribute after decoding.
///
/// Numeric variants always hold a `Vec`; a scalar attribute is represented as a
/// vector with exactly one element.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeValue {
    /// One or more signed 8-bit integers.
    I8(Vec<i8>),
    /// One or more signed 16-bit integers.
    I16(Vec<i16>),
    /// One or more signed 32-bit integers.
    I32(Vec<i32>),
    /// One or more signed 64-bit integers.
    I64(Vec<i64>),
    /// One or more unsigned 8-bit integers.
    U8(Vec<u8>),
    /// One or more unsigned 16-bit integers.
    U16(Vec<u16>),
    /// One or more unsigned 32-bit integers.
    U32(Vec<u32>),
    /// One or more unsigned 64-bit integers.
    U64(Vec<u64>),
    /// One or more 32-bit floats.
    F32(Vec<f32>),
    /// One or more 64-bit floats.
    F64(Vec<f64>),
    /// A decoded string.
    String(String),
    /// Undecoded bytes, used for datatypes that have no typed representation here.
    Raw(Vec<u8>),
}

impl AttributeValue {
    /// Reports whether the value consists of a single element.
    ///
    /// `String` and `Raw` values are always reported as scalar; numeric values
    /// are scalar only when their vector holds exactly one entry.
    pub fn is_scalar(&self) -> bool {
        let element_count = match self {
            AttributeValue::String(_) | AttributeValue::Raw(_) => return true,
            AttributeValue::I8(items) => items.len(),
            AttributeValue::I16(items) => items.len(),
            AttributeValue::I32(items) => items.len(),
            AttributeValue::I64(items) => items.len(),
            AttributeValue::U8(items) => items.len(),
            AttributeValue::U16(items) => items.len(),
            AttributeValue::U32(items) => items.len(),
            AttributeValue::U64(items) => items.len(),
            AttributeValue::F32(items) => items.len(),
            AttributeValue::F64(items) => items.len(),
        };
        element_count == 1
    }

    /// Returns the value as an `i32` when it is a one-element `I32`, otherwise `None`.
    pub fn as_i32(&self) -> Option<i32> {
        match self {
            AttributeValue::I32(items) => match items.as_slice() {
                [only] => Some(*only),
                _ => None,
            },
            _ => None,
        }
    }

    /// Returns the value as an `i64` when it is a one-element `I64`, otherwise `None`.
    pub fn as_i64(&self) -> Option<i64> {
        match self {
            AttributeValue::I64(items) => match items.as_slice() {
                [only] => Some(*only),
                _ => None,
            },
            _ => None,
        }
    }

    /// Returns the value as an `f32` when it is a one-element `F32`, otherwise `None`.
    pub fn as_f32(&self) -> Option<f32> {
        match self {
            AttributeValue::F32(items) => match items.as_slice() {
                [only] => Some(*only),
                _ => None,
            },
            _ => None,
        }
    }

    /// Returns the value as an `f64` when it is a one-element `F64`, otherwise `None`.
    pub fn as_f64(&self) -> Option<f64> {
        match self {
            AttributeValue::F64(items) => match items.as_slice() {
                [only] => Some(*only),
                _ => None,
            },
            _ => None,
        }
    }

    /// Borrows the contained text when this is a `String` value, otherwise `None`.
    pub fn as_str(&self) -> Option<&str> {
        if let AttributeValue::String(text) = self {
            Some(text.as_str())
        } else {
            None
        }
    }
}
100
101/// A fully resolved HDF5 attribute: name + decoded value.
102#[derive(Debug, Clone)]
103pub struct Attribute {
104    /// Attribute name.
105    pub name: String,
106    /// Decoded value.
107    pub value: AttributeValue,
108}
109
110/// An HDF5 attribute (name-value pair attached to a group or dataset).
111///
112/// Message type 0x000C.
113#[derive(Debug, Clone)]
114pub struct AttributeMessage {
115    /// Attribute name.
116    pub name: String,
117    /// Data type of the attribute value.
118    pub dtype: DataType,
119    /// Dataspace (dimensionality) of the attribute.
120    pub dataspace: DataspaceMessage,
121    /// Raw value bytes.
122    pub raw_value: Bytes,
123}
124
125impl AttributeMessage {
126    /// Decode the raw value bytes into a typed `AttributeValue`.
127    pub fn decode(&self) -> AttributeValue {
128        let n = num_elements(&self.dataspace.dimensions) as usize;
129        let raw = &self.raw_value;
130
131        match &self.dtype {
132            DataType::FixedPoint {
133                size,
134                signed,
135                byte_order,
136                ..
137            } => decode_fixed_point(raw, *size, *signed, *byte_order, n),
138
139            DataType::FloatingPoint {
140                size, byte_order, ..
141            } => decode_floating_point(raw, *size, *byte_order, n),
142
143            DataType::String { size, padding, .. } => {
144                let s = if *size == 0 {
145                    // Zero-size means the string is empty or this is a vlen string
146                    String::new()
147                } else {
148                    let end = (*size as usize).min(raw.len());
149                    let bytes = &raw[..end];
150                    let s = String::from_utf8_lossy(bytes);
151                    match padding {
152                        StringPadding::NullTerminate => {
153                            s.split('\0').next().unwrap_or("").to_string()
154                        }
155                        StringPadding::NullPad => s.trim_end_matches('\0').to_string(),
156                        StringPadding::SpacePad => s.trim_end().to_string(),
157                    }
158                };
159                AttributeValue::String(s)
160            }
161
162            // Enum with 1-byte base → treat as bool (h5py convention: FALSE=0, TRUE=1)
163            DataType::Enum { base_type, .. } => {
164                // Decode as the base type
165                match base_type.as_ref() {
166                    DataType::FixedPoint {
167                        size,
168                        signed,
169                        byte_order,
170                        ..
171                    } => decode_fixed_point(raw, *size, *signed, *byte_order, n),
172                    _ => AttributeValue::Raw(raw.to_vec()),
173                }
174            }
175
176            _ => AttributeValue::Raw(raw.to_vec()),
177        }
178    }
179
180    /// Convert this message to a resolved `Attribute` with decoded value.
181    pub fn to_attribute(&self) -> Attribute {
182        Attribute {
183            name: self.name.clone(),
184            value: self.decode(),
185        }
186    }
187
188    /// Decode the raw value, resolving variable-length data via the global heap.
189    ///
190    /// This handles the common case of vlen strings written by h5py (the default).
191    /// Falls back to `decode()` for non-vlen types.
192    pub async fn decode_with_reader(
193        &self,
194        reader: &Arc<dyn AsyncFileReader>,
195        size_of_offsets: u8,
196        size_of_lengths: u8,
197    ) -> Result<AttributeValue> {
198        match &self.dtype {
199            DataType::VarLen {
200                is_string: true, ..
201            } => {
202                // Vlen string: raw_value = length(4) + collection_addr(size_of_offsets) + object_index(4)
203                let raw = &self.raw_value;
204                if raw.len() < 4 + size_of_offsets as usize + 4 {
205                    return Ok(AttributeValue::String(String::new()));
206                }
207                let mut r = HDF5Reader::with_sizes(
208                    self.raw_value.clone(),
209                    size_of_offsets,
210                    size_of_lengths,
211                );
212                let _seq_len = r.read_u32()?;
213                let collection_addr = r.read_offset()?;
214                let object_index = r.read_u32()?;
215
216                // Undefined address means empty/null
217                if HDF5Reader::is_undef_addr(collection_addr, size_of_offsets) {
218                    return Ok(AttributeValue::String(String::new()));
219                }
220
221                let obj_data = heap::global::read_global_heap_object(
222                    reader,
223                    collection_addr,
224                    object_index,
225                    size_of_offsets,
226                    size_of_lengths,
227                )
228                .await?;
229
230                let s = String::from_utf8_lossy(&obj_data);
231                // Vlen strings are null-terminated by convention
232                let s = s.split('\0').next().unwrap_or("").to_string();
233                Ok(AttributeValue::String(s))
234            }
235            _ => Ok(self.decode()),
236        }
237    }
238
239    /// Async version of `to_attribute` that resolves vlen data.
240    pub async fn to_attribute_resolved(
241        &self,
242        reader: &Arc<dyn AsyncFileReader>,
243        size_of_offsets: u8,
244        size_of_lengths: u8,
245    ) -> Result<Attribute> {
246        let value = self
247            .decode_with_reader(reader, size_of_offsets, size_of_lengths)
248            .await?;
249        Ok(Attribute {
250            name: self.name.clone(),
251            value,
252        })
253    }
254
255    /// Parse from the raw message bytes.
256    pub fn parse(data: &Bytes, size_of_offsets: u8, size_of_lengths: u8) -> Result<Self> {
257        let mut r = HDF5Reader::with_sizes(data.clone(), size_of_offsets, size_of_lengths);
258
259        let version = r.read_u8()?;
260
261        match version {
262            1 => Self::parse_v1(&mut r, data, size_of_offsets, size_of_lengths),
263            2 => Self::parse_v2(&mut r, data, size_of_offsets, size_of_lengths),
264            3 => Self::parse_v3(&mut r, data, size_of_offsets, size_of_lengths),
265            _ => {
266                // Best effort — return minimal
267                Ok(Self {
268                    name: String::new(),
269                    dtype: DataType::Opaque {
270                        size: 0,
271                        tag: String::new(),
272                    },
273                    dataspace: DataspaceMessage {
274                        rank: 0,
275                        dataspace_type: 0,
276                        dimensions: vec![],
277                        max_dimensions: None,
278                    },
279                    raw_value: Bytes::new(),
280                })
281            }
282        }
283    }
284
285    fn parse_v1(
286        r: &mut HDF5Reader,
287        data: &Bytes,
288        _size_of_offsets: u8,
289        size_of_lengths: u8,
290    ) -> Result<Self> {
291        r.skip(1); // reserved
292        let name_size = r.read_u16()? as usize;
293        let datatype_size = r.read_u16()? as usize;
294        let dataspace_size = r.read_u16()? as usize;
295
296        // Name (padded to 8-byte boundary)
297        let name_bytes = r.read_bytes(name_size)?;
298        let name = String::from_utf8_lossy(&name_bytes)
299            .trim_end_matches('\0')
300            .to_string();
301        r.skip_field_padding(name_size, 8);
302
303        let (dtype, dataspace, raw_value) = parse_dtype_dataspace_value(
304            r,
305            data,
306            datatype_size,
307            dataspace_size,
308            size_of_lengths,
309            true,
310        )?;
311
312        Ok(Self {
313            name,
314            dtype,
315            dataspace,
316            raw_value,
317        })
318    }
319
320    fn parse_v2(
321        r: &mut HDF5Reader,
322        data: &Bytes,
323        _size_of_offsets: u8,
324        size_of_lengths: u8,
325    ) -> Result<Self> {
326        // v2: same as v1 but no padding on name/dt/ds
327        let _flags = r.read_u8()?;
328        let name_size = r.read_u16()? as usize;
329        let datatype_size = r.read_u16()? as usize;
330        let dataspace_size = r.read_u16()? as usize;
331
332        let name_bytes = r.read_bytes(name_size)?;
333        let name = String::from_utf8_lossy(&name_bytes)
334            .trim_end_matches('\0')
335            .to_string();
336
337        let (dtype, dataspace, raw_value) = parse_dtype_dataspace_value(
338            r,
339            data,
340            datatype_size,
341            dataspace_size,
342            size_of_lengths,
343            false,
344        )?;
345
346        Ok(Self {
347            name,
348            dtype,
349            dataspace,
350            raw_value,
351        })
352    }
353
354    fn parse_v3(
355        r: &mut HDF5Reader,
356        data: &Bytes,
357        _size_of_offsets: u8,
358        size_of_lengths: u8,
359    ) -> Result<Self> {
360        // v3: same as v2 but with charset flag
361        let flags = r.read_u8()?;
362        let name_size = r.read_u16()? as usize;
363        let datatype_size = r.read_u16()? as usize;
364        let dataspace_size = r.read_u16()? as usize;
365
366        let _charset = if flags & 0x10 != 0 { r.read_u8()? } else { 0 };
367
368        let name_bytes = r.read_bytes(name_size)?;
369        let name = String::from_utf8_lossy(&name_bytes)
370            .trim_end_matches('\0')
371            .to_string();
372
373        let (dtype, dataspace, raw_value) = parse_dtype_dataspace_value(
374            r,
375            data,
376            datatype_size,
377            dataspace_size,
378            size_of_lengths,
379            false,
380        )?;
381
382        Ok(Self {
383            name,
384            dtype,
385            dataspace,
386            raw_value,
387        })
388    }
389}
390
/// Multiply the dataspace dimensions together to get the element count.
///
/// The product saturates at `u64::MAX` rather than overflowing, so malformed
/// dimensions cannot cause an overflow panic. The result is never below 1: an
/// empty dimension list gives 1, and a product of zero is also reported as 1.
fn num_elements(dimensions: &[u64]) -> u64 {
    let mut total: u64 = 1;
    for &extent in dimensions {
        total = total.saturating_mul(extent);
    }
    if total == 0 {
        1
    } else {
        total
    }
}
400
401/// Extract the raw value bytes at the current reader position, using the
402/// dataspace dimensions and dtype size to compute how many bytes to read.
403fn extract_raw_value(
404    r: &HDF5Reader,
405    data: &Bytes,
406    dataspace: &DataspaceMessage,
407    dtype: &DataType,
408) -> Bytes {
409    let n = num_elements(&dataspace.dimensions);
410    let value_size = n.saturating_mul(dtype.size() as u64) as usize;
411    let val_start = r.position() as usize;
412    if val_start + value_size <= data.len() {
413        data.slice(val_start..val_start + value_size)
414    } else {
415        Bytes::new()
416    }
417}
418
419/// Parse the datatype, dataspace, and raw value from the current reader position.
420/// Used by all three attribute message versions. When `pad_to_8` is true (v1),
421/// each section is padded to an 8-byte boundary.
422fn parse_dtype_dataspace_value(
423    r: &mut HDF5Reader,
424    data: &Bytes,
425    datatype_size: usize,
426    dataspace_size: usize,
427    size_of_lengths: u8,
428    pad_to_8: bool,
429) -> Result<(DataType, DataspaceMessage, Bytes)> {
430    let dt_start = r.position() as usize;
431    let dt_bytes = data.slice(dt_start..dt_start + datatype_size);
432    let dtype = DataType::parse(&dt_bytes)?;
433    r.skip(datatype_size as u64);
434    if pad_to_8 {
435        r.skip_field_padding(datatype_size, 8);
436    }
437
438    let ds_start = r.position() as usize;
439    let ds_bytes = data.slice(ds_start..ds_start + dataspace_size);
440    let dataspace = DataspaceMessage::parse(&ds_bytes, size_of_lengths)?;
441    r.skip(dataspace_size as u64);
442    if pad_to_8 {
443        r.skip_field_padding(dataspace_size, 8);
444    }
445
446    let raw_value = extract_raw_value(r, data, &dataspace, &dtype);
447    Ok((dtype, dataspace, raw_value))
448}
449
/// Turn raw bytes into a `Vec<$ty>`: the input is cut into `$width`-byte chunks,
/// at most `$n` chunks are taken, and each chunk is read as little-endian when
/// `$is_le` is true and big-endian otherwise. A trailing partial chunk is ignored.
macro_rules! decode_numeric {
    ($raw:expr, $n:expr, $is_le:expr, $width:expr, $ty:ty) => {{
        $raw.chunks_exact($width)
            .take($n)
            .map(|chunk| {
                // `chunks_exact` guarantees every chunk is exactly `$width` bytes.
                let mut buf = [0u8; $width];
                buf.copy_from_slice(chunk);
                match $is_le {
                    true => <$ty>::from_le_bytes(buf),
                    false => <$ty>::from_be_bytes(buf),
                }
            })
            .collect::<Vec<$ty>>()
    }};
}
467
468/// Decode fixed-point (integer) raw bytes into an `AttributeValue`.
469fn decode_fixed_point(
470    raw: &[u8],
471    size: u32,
472    signed: bool,
473    byte_order: ByteOrder,
474    n: usize,
475) -> AttributeValue {
476    let is_le = matches!(byte_order, ByteOrder::LittleEndian);
477    match (size, signed) {
478        (1, true) => AttributeValue::I8(raw.iter().take(n).map(|&b| b as i8).collect()),
479        (1, false) => AttributeValue::U8(raw.iter().take(n).copied().collect()),
480        (2, true) => AttributeValue::I16(decode_numeric!(raw, n, is_le, 2, i16)),
481        (2, false) => AttributeValue::U16(decode_numeric!(raw, n, is_le, 2, u16)),
482        (4, true) => AttributeValue::I32(decode_numeric!(raw, n, is_le, 4, i32)),
483        (4, false) => AttributeValue::U32(decode_numeric!(raw, n, is_le, 4, u32)),
484        (8, true) => AttributeValue::I64(decode_numeric!(raw, n, is_le, 8, i64)),
485        (8, false) => AttributeValue::U64(decode_numeric!(raw, n, is_le, 8, u64)),
486        _ => AttributeValue::Raw(raw.to_vec()),
487    }
488}
489
490/// Decode floating-point raw bytes into an `AttributeValue`.
491fn decode_floating_point(raw: &[u8], size: u32, byte_order: ByteOrder, n: usize) -> AttributeValue {
492    let is_le = matches!(byte_order, ByteOrder::LittleEndian);
493    match size {
494        4 => AttributeValue::F32(decode_numeric!(raw, n, is_le, 4, f32)),
495        8 => AttributeValue::F64(decode_numeric!(raw, n, is_le, 8, f64)),
496        _ => AttributeValue::Raw(raw.to_vec()),
497    }
498}