Skip to main content

hdf5_reader/
attribute_api.rs

1use crate::error::{Error, Result};
2use crate::fractal_heap::FractalHeap;
3use crate::global_heap::GlobalHeapCollection;
4use crate::io::Cursor;
5use crate::messages::attribute::AttributeMessage;
6use crate::messages::attribute_info::AttributeInfoMessage;
7use crate::messages::dataspace::DataspaceType;
8use crate::messages::datatype::{Datatype, StringEncoding, StringPadding, StringSize};
9use crate::messages::HdfMessage;
10use crate::object_header::ObjectHeader;
11use crate::{btree_v2, messages};
12
13/// A parsed, high-level HDF5 attribute.
14#[derive(Debug, Clone)]
15pub struct Attribute {
16    pub name: String,
17    pub datatype: Datatype,
18    pub shape: Vec<u64>,
19    pub raw_data: Vec<u8>,
20}
21
22impl Attribute {
23    /// Create from a parsed attribute message.
24    pub fn from_message(msg: AttributeMessage) -> Self {
25        Self::from_message_with_context(msg, None, 0)
26    }
27
28    /// Create from a parsed attribute message with optional file context for
29    /// resolving variable-length byte attributes stored in the global heap.
30    pub fn from_message_with_context(
31        msg: AttributeMessage,
32        file_data: Option<&[u8]>,
33        offset_size: u8,
34    ) -> Self {
35        let shape = match msg.dataspace.dataspace_type {
36            DataspaceType::Scalar => vec![],
37            DataspaceType::Null => vec![0],
38            DataspaceType::Simple => msg.dataspace.dims.clone(),
39        };
40        let raw_data =
41            if let (Some(file_data), Datatype::VarLen { base }) = (file_data, &msg.datatype) {
42                if is_byte_vlen(base) && shape.is_empty() {
43                    resolve_vlen_bytes(&msg.raw_data, file_data, offset_size)
44                        .unwrap_or_else(|| msg.raw_data.clone())
45                } else {
46                    msg.raw_data.clone()
47                }
48            } else {
49                msg.raw_data.clone()
50            };
51        Attribute {
52            name: msg.name,
53            datatype: msg.datatype,
54            shape,
55            raw_data,
56        }
57    }
58
59    /// Total number of elements.
60    pub fn num_elements(&self) -> u64 {
61        if self.shape.is_empty() {
62            1 // scalar
63        } else {
64            self.shape.iter().product()
65        }
66    }
67
68    /// Read the attribute value as a scalar of the given type.
69    pub fn read_scalar<T: crate::datatype_api::H5Type>(&self) -> Result<T> {
70        T::from_bytes(&self.raw_data, &self.datatype)
71    }
72
73    /// Read the attribute as a 1-D vector of the given type.
74    pub fn read_1d<T: crate::datatype_api::H5Type>(&self) -> Result<Vec<T>> {
75        let elem_size = T::element_size(&self.datatype);
76        let n = self.num_elements() as usize;
77        let mut result = Vec::with_capacity(n);
78        for i in 0..n {
79            let start = i * elem_size;
80            let end = start + elem_size;
81            if end > self.raw_data.len() {
82                return Err(Error::InvalidData(format!(
83                    "attribute data too short: need {} bytes, have {}",
84                    end,
85                    self.raw_data.len()
86                )));
87            }
88            result.push(T::from_bytes(&self.raw_data[start..end], &self.datatype)?);
89        }
90        Ok(result)
91    }
92
93    /// Read the attribute as a string (for string-typed attributes).
94    ///
95    /// For variable-length strings, use `read_vlen_string()` with the file data
96    /// and offset_size — this method will return an error directing you there.
97    pub fn read_string(&self) -> Result<String> {
98        match &self.datatype {
99            Datatype::VarLen { base } if is_byte_vlen(base) => {
100                decode_varlen_byte_string(&self.raw_data)
101            }
102            Datatype::String {
103                size,
104                encoding,
105                padding,
106            } => match size {
107                StringSize::Fixed(len) => {
108                    let len = *len as usize;
109                    let bytes = if self.raw_data.len() < len {
110                        &self.raw_data
111                    } else {
112                        &self.raw_data[..len]
113                    };
114                    decode_string(bytes, *padding, *encoding)
115                }
116                StringSize::Variable => {
117                    // For inline vlen strings in attributes, try direct decode.
118                    // If it looks like a global heap reference (>= 12 bytes for
119                    // seq_len + addr + index), suggest read_vlen_string instead.
120                    if self.raw_data.len() >= 12 {
121                        // Try to decode directly first — some files inline the string
122                        let trimmed = match padding {
123                            StringPadding::NullTerminate => {
124                                let end = self
125                                    .raw_data
126                                    .iter()
127                                    .position(|&b| b == 0)
128                                    .unwrap_or(self.raw_data.len());
129                                &self.raw_data[..end]
130                            }
131                            _ => &self.raw_data,
132                        };
133                        if let Ok(s) = String::from_utf8(trimmed.to_vec()) {
134                            if s.chars()
135                                .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
136                            {
137                                return Ok(s);
138                            }
139                        }
140                    }
141                    decode_string(&self.raw_data, *padding, *encoding)
142                }
143            },
144            _ => Err(Error::TypeMismatch {
145                expected: "String".into(),
146                actual: format!("{:?}", self.datatype),
147            }),
148        }
149    }
150
151    /// Read a variable-length string attribute from the global heap.
152    ///
153    /// Variable-length strings in HDF5 are stored as references into a global
154    /// heap collection. Each reference is: `seq_len(u32) + heap_addr(offset_size) + index(u32)`.
155    pub fn read_vlen_string(&self, file_data: &[u8], offset_size: u8) -> Result<String> {
156        match &self.datatype {
157            Datatype::String {
158                size: StringSize::Variable,
159                encoding,
160                padding,
161            } => {
162                let ref_size = 4 + offset_size as usize + 4; // seq_len + addr + index
163                if self.raw_data.len() < ref_size {
164                    // Fallback: try direct decode
165                    return decode_string(&self.raw_data, *padding, *encoding);
166                }
167                let bytes = read_one_vlen_string(
168                    &self.raw_data,
169                    0,
170                    file_data,
171                    offset_size,
172                    *padding,
173                    *encoding,
174                )?;
175                Ok(bytes)
176            }
177            Datatype::String {
178                size: StringSize::Fixed(_),
179                ..
180            } => self.read_string(),
181            _ => Err(Error::TypeMismatch {
182                expected: "String".into(),
183                actual: format!("{:?}", self.datatype),
184            }),
185        }
186    }
187
188    /// Read an array of variable-length strings from the global heap.
189    pub fn read_vlen_strings(&self, file_data: &[u8], offset_size: u8) -> Result<Vec<String>> {
190        match &self.datatype {
191            Datatype::String {
192                size: StringSize::Variable,
193                encoding,
194                padding,
195            } => {
196                let ref_size = 4 + offset_size as usize + 4;
197                let n = self.num_elements() as usize;
198                let mut result = Vec::with_capacity(n);
199                for i in 0..n {
200                    let offset = i * ref_size;
201                    if offset + ref_size > self.raw_data.len() {
202                        break;
203                    }
204                    result.push(read_one_vlen_string(
205                        &self.raw_data,
206                        offset,
207                        file_data,
208                        offset_size,
209                        *padding,
210                        *encoding,
211                    )?);
212                }
213                Ok(result)
214            }
215            Datatype::String {
216                size: StringSize::Fixed(_),
217                ..
218            } => self.read_strings(),
219            _ => Err(Error::TypeMismatch {
220                expected: "String array".into(),
221                actual: format!("{:?}", self.datatype),
222            }),
223        }
224    }
225
226    /// Read the attribute as a vector of strings.
227    pub fn read_strings(&self) -> Result<Vec<String>> {
228        match &self.datatype {
229            Datatype::String {
230                size: StringSize::Fixed(len),
231                encoding,
232                padding,
233            } => {
234                let len = *len as usize;
235                let n = self.num_elements() as usize;
236                let mut result = Vec::with_capacity(n);
237                for i in 0..n {
238                    let start = i * len;
239                    let end = (start + len).min(self.raw_data.len());
240                    if start >= self.raw_data.len() {
241                        break;
242                    }
243                    result.push(decode_string(
244                        &self.raw_data[start..end],
245                        *padding,
246                        *encoding,
247                    )?);
248                }
249                Ok(result)
250            }
251            _ => Err(Error::TypeMismatch {
252                expected: "String array".into(),
253                actual: format!("{:?}", self.datatype),
254            }),
255        }
256    }
257
258    /// Read an attribute as f64 (with automatic promotion from int types).
259    pub fn read_as_f64(&self) -> Result<f64> {
260        match &self.datatype {
261            Datatype::FloatingPoint { size, .. } => {
262                let val: f64 = match size {
263                    4 => {
264                        let v = self.read_scalar::<f32>()?;
265                        v as f64
266                    }
267                    8 => self.read_scalar::<f64>()?,
268                    _ => {
269                        return Err(Error::TypeMismatch {
270                            expected: "f32 or f64".into(),
271                            actual: format!("FloatingPoint(size={})", size),
272                        })
273                    }
274                };
275                Ok(val)
276            }
277            Datatype::FixedPoint { size, signed, .. } => {
278                let val = match (size, signed) {
279                    (1, true) => self.read_scalar::<i8>()? as f64,
280                    (1, false) => self.read_scalar::<u8>()? as f64,
281                    (2, true) => self.read_scalar::<i16>()? as f64,
282                    (2, false) => self.read_scalar::<u16>()? as f64,
283                    (4, true) => self.read_scalar::<i32>()? as f64,
284                    (4, false) => self.read_scalar::<u32>()? as f64,
285                    (8, true) => self.read_scalar::<i64>()? as f64,
286                    (8, false) => self.read_scalar::<u64>()? as f64,
287                    _ => {
288                        return Err(Error::TypeMismatch {
289                            expected: "numeric".into(),
290                            actual: format!("FixedPoint(size={})", size),
291                        })
292                    }
293                };
294                Ok(val)
295            }
296            _ => Err(Error::TypeMismatch {
297                expected: "numeric".into(),
298                actual: format!("{:?}", self.datatype),
299            }),
300        }
301    }
302}
303
304pub(crate) fn collect_attribute_messages(
305    header: &ObjectHeader,
306    file_data: &[u8],
307    offset_size: u8,
308    length_size: u8,
309) -> Result<Vec<AttributeMessage>> {
310    let mut attributes = Vec::new();
311    let mut attribute_info = None;
312
313    for msg in &header.messages {
314        match msg {
315            HdfMessage::Attribute(attr) => attributes.push(attr.clone()),
316            HdfMessage::AttributeInfo(info) => attribute_info = Some(info.clone()),
317            _ => {}
318        }
319    }
320
321    if let Some(info) = attribute_info {
322        attributes.extend(load_dense_attribute_messages(
323            &info,
324            file_data,
325            offset_size,
326            length_size,
327        )?);
328    }
329
330    Ok(attributes)
331}
332
333fn load_dense_attribute_messages(
334    info: &AttributeInfoMessage,
335    file_data: &[u8],
336    offset_size: u8,
337    length_size: u8,
338) -> Result<Vec<AttributeMessage>> {
339    if Cursor::is_undefined_offset(info.fractal_heap_address, offset_size) {
340        return Ok(Vec::new());
341    }
342
343    let mut heap_cursor = Cursor::new(file_data);
344    heap_cursor.set_position(info.fractal_heap_address);
345    let heap = FractalHeap::parse(&mut heap_cursor, offset_size, length_size)?;
346
347    let records =
348        load_dense_attribute_records(info, file_data, offset_size, length_size).unwrap_or_default();
349
350    let mut attributes = Vec::new();
351    for record in records {
352        let heap_id = match record {
353            btree_v2::BTreeV2Record::AttributeNameHash { heap_id, .. }
354            | btree_v2::BTreeV2Record::AttributeCreationOrder { heap_id, .. } => heap_id,
355            _ => continue,
356        };
357
358        let managed_bytes =
359            match heap.get_managed_object(&heap_id, file_data, offset_size, length_size) {
360                Ok(bytes) => bytes,
361                Err(_) => continue,
362            };
363
364        let mut attr_cursor = Cursor::new(&managed_bytes);
365        if let Ok(attr) = messages::attribute::parse(
366            &mut attr_cursor,
367            offset_size,
368            length_size,
369            managed_bytes.len(),
370        ) {
371            attributes.push(attr);
372        }
373    }
374
375    Ok(attributes)
376}
377
378fn load_dense_attribute_records(
379    info: &AttributeInfoMessage,
380    file_data: &[u8],
381    offset_size: u8,
382    length_size: u8,
383) -> Result<Vec<btree_v2::BTreeV2Record>> {
384    let mut addrs = vec![info.btree_name_index_address];
385    if let Some(creation_order_addr) = info.btree_creation_order_address {
386        addrs.push(creation_order_addr);
387    }
388
389    for addr in addrs {
390        if Cursor::is_undefined_offset(addr, offset_size) {
391            continue;
392        }
393
394        let mut btree_cursor = Cursor::new(file_data);
395        btree_cursor.set_position(addr);
396        let header =
397            match btree_v2::BTreeV2Header::parse(&mut btree_cursor, offset_size, length_size) {
398                Ok(header) => header,
399                Err(_) => continue,
400            };
401
402        if let Ok(records) = btree_v2::collect_btree_v2_records(
403            file_data,
404            &header,
405            offset_size,
406            length_size,
407            None,
408            &[],
409            None,
410        ) {
411            return Ok(records);
412        }
413    }
414
415    Ok(Vec::new())
416}
417
418/// Read one variable-length string from a vlen reference in raw_data.
419pub(crate) fn read_one_vlen_string(
420    raw_data: &[u8],
421    offset: usize,
422    file_data: &[u8],
423    offset_size: u8,
424    padding: StringPadding,
425    encoding: StringEncoding,
426) -> Result<String> {
427    let mut cursor = Cursor::new(&raw_data[offset..]);
428    let _seq_len = cursor.read_u32_le()?;
429    let heap_addr = cursor.read_offset(offset_size)?;
430    let obj_index = cursor.read_u32_le()?;
431
432    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
433        return Ok(String::new());
434    }
435
436    let mut heap_cursor = Cursor::new(file_data);
437    heap_cursor.set_position(heap_addr);
438    let collection = GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size)?;
439
440    match collection.get_object(obj_index as u16) {
441        Some(obj) => decode_string(&obj.data, padding, encoding),
442        None => Ok(String::new()),
443    }
444}
445
446/// Decode a byte slice into a String, handling padding and encoding.
447///
448/// HDF5 supports ASCII and UTF-8 string encodings. Both are valid UTF-8
449/// (ASCII is a strict subset), so we decode uniformly via `from_utf8`.
450pub(crate) fn decode_string(
451    bytes: &[u8],
452    padding: StringPadding,
453    _encoding: StringEncoding,
454) -> Result<String> {
455    let trimmed = match padding {
456        StringPadding::NullTerminate => {
457            let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
458            &bytes[..end]
459        }
460        StringPadding::NullPad => {
461            let end = bytes.iter().rposition(|&b| b != 0).map_or(0, |i| i + 1);
462            &bytes[..end]
463        }
464        StringPadding::SpacePad => {
465            let end = bytes.iter().rposition(|&b| b != b' ').map_or(0, |i| i + 1);
466            &bytes[..end]
467        }
468    };
469
470    String::from_utf8(trimmed.to_vec())
471        .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
472}
473
474fn is_byte_vlen(base: &Datatype) -> bool {
475    matches!(base, Datatype::FixedPoint { size: 1, .. })
476}
477
478pub(crate) fn decode_varlen_byte_string(bytes: &[u8]) -> Result<String> {
479    let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
480    String::from_utf8(bytes[..end].to_vec())
481        .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
482}
483
484pub(crate) fn resolve_vlen_bytes(
485    raw_data: &[u8],
486    file_data: &[u8],
487    offset_size: u8,
488) -> Option<Vec<u8>> {
489    if raw_data.len() < 4 + offset_size as usize + 4 {
490        return None;
491    }
492
493    let mut cursor = Cursor::new(raw_data);
494    let seq_len = cursor.read_u32_le().ok()? as usize;
495    let heap_addr = cursor.read_offset(offset_size).ok()?;
496    let obj_index = cursor.read_u32_le().ok()? as u16;
497
498    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
499        return Some(Vec::new());
500    }
501
502    let mut heap_cursor = Cursor::new(file_data);
503    heap_cursor.set_position(heap_addr);
504    let collection =
505        GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size).ok()?;
506    let object = collection.get_object(obj_index)?;
507    Some(object.data[..object.data.len().min(seq_len)].to_vec())
508}
509
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ByteOrder;
    use std::f64::consts::PI;

    /// Assemble an attribute from its four parts.
    fn attribute(name: &str, datatype: Datatype, shape: Vec<u64>, raw_data: Vec<u8>) -> Attribute {
        Attribute {
            name: name.to_string(),
            datatype,
            shape,
            raw_data,
        }
    }

    /// Signed little-endian 32-bit integer datatype.
    fn le_i32() -> Datatype {
        Datatype::FixedPoint {
            size: 4,
            signed: true,
            byte_order: ByteOrder::LittleEndian,
        }
    }

    #[test]
    fn test_scalar_f64_attribute() {
        let datatype = Datatype::FloatingPoint {
            size: 8,
            byte_order: ByteOrder::LittleEndian,
        };
        let attr = attribute("pi", datatype, vec![], PI.to_le_bytes().to_vec());

        let val = attr.read_scalar::<f64>().unwrap();
        assert!((val - PI).abs() < 1e-10);
    }

    #[test]
    fn test_1d_i32_attribute() {
        let values = [1i32, 2, 3, 4];
        let raw_data: Vec<u8> = values.iter().flat_map(|v| v.to_le_bytes()).collect();
        let attr = attribute("data", le_i32(), vec![4], raw_data);

        let result = attr.read_1d::<i32>().unwrap();
        assert_eq!(result, vec![1, 2, 3, 4]);
    }

    #[test]
    fn test_string_attribute() {
        let datatype = Datatype::String {
            size: StringSize::Fixed(10),
            encoding: StringEncoding::Ascii,
            padding: StringPadding::NullPad,
        };
        let attr = attribute("units", datatype, vec![], b"meters\0\0\0\0".to_vec());

        assert_eq!(attr.read_string().unwrap(), "meters");
    }

    #[test]
    fn test_varlen_byte_string_attribute() {
        let byte_type = Datatype::FixedPoint {
            size: 1,
            signed: false,
            byte_order: ByteOrder::LittleEndian,
        };
        let datatype = Datatype::VarLen {
            base: Box::new(byte_type),
        };
        let attr = attribute("name", datatype, vec![], b"test_dataset".to_vec());

        assert_eq!(attr.read_string().unwrap(), "test_dataset");
    }

    #[test]
    fn test_read_as_f64_from_int() {
        let attr = attribute("count", le_i32(), vec![], 42i32.to_le_bytes().to_vec());

        let val = attr.read_as_f64().unwrap();
        assert!((val - 42.0).abs() < 1e-10);
    }
}