Skip to main content

hdf5_reader/
attribute_api.rs

1use crate::error::{Error, Result};
2use crate::filters::FilterRegistry;
3use crate::fractal_heap::FractalHeap;
4use crate::global_heap::GlobalHeapCollection;
5use crate::io::Cursor;
6use crate::messages::attribute::AttributeMessage;
7use crate::messages::attribute_info::AttributeInfoMessage;
8use crate::messages::dataspace::DataspaceType;
9use crate::messages::datatype::{Datatype, StringEncoding, StringPadding, StringSize, VarLenKind};
10use crate::messages::HdfMessage;
11use crate::object_header::ObjectHeader;
12use crate::storage::Storage;
13use crate::{btree_v2, messages};
14
15fn checked_usize(value: u64, context: &str) -> Result<usize> {
16    usize::try_from(value).map_err(|_| {
17        Error::InvalidData(format!(
18            "{context} value {value} exceeds platform usize capacity"
19        ))
20    })
21}
22
/// A parsed, high-level HDF5 attribute.
///
/// Instances are built from an `AttributeMessage` through
/// `Attribute::from_message` or `Attribute::from_message_with_context`.
#[derive(Debug, Clone)]
pub struct Attribute {
    /// Attribute name, taken unchanged from the attribute message.
    pub name: String,
    /// Element datatype, taken unchanged from the attribute message.
    pub datatype: Datatype,
    /// Dimensions of the value: empty for a scalar dataspace, `[0]` for a null
    /// dataspace, otherwise the dataspace's dims.
    pub shape: Vec<u64>,
    /// Value bytes. Normally the message's raw bytes; for a scalar
    /// variable-length byte string resolved with file context, the bytes
    /// fetched from the global heap instead.
    pub raw_data: Vec<u8>,
}
31
32impl Attribute {
33    /// Create from a parsed attribute message.
34    pub fn from_message(msg: AttributeMessage) -> Self {
35        Self::from_message_with_context(msg, None, 0)
36    }
37
38    /// Create from a parsed attribute message with optional file context for
39    /// resolving variable-length byte attributes stored in the global heap.
40    pub fn from_message_with_context(
41        msg: AttributeMessage,
42        file_data: Option<&[u8]>,
43        offset_size: u8,
44    ) -> Self {
45        let shape = match msg.dataspace.dataspace_type {
46            DataspaceType::Scalar => vec![],
47            DataspaceType::Null => vec![0],
48            DataspaceType::Simple => msg.dataspace.dims.clone(),
49        };
50        let raw_data = if let (Some(file_data), Datatype::VarLen { base, kind, .. }) =
51            (file_data, &msg.datatype)
52        {
53            if *kind == VarLenKind::String && is_byte_vlen(base) && shape.is_empty() {
54                resolve_vlen_bytes(&msg.raw_data, file_data, offset_size)
55                    .unwrap_or_else(|| msg.raw_data.clone())
56            } else {
57                msg.raw_data.clone()
58            }
59        } else {
60            msg.raw_data.clone()
61        };
62        Attribute {
63            name: msg.name,
64            datatype: msg.datatype,
65            shape,
66            raw_data,
67        }
68    }
69
70    /// Total number of elements.
71    pub fn num_elements(&self) -> Result<u64> {
72        if self.shape.is_empty() {
73            Ok(1) // scalar
74        } else {
75            self.shape.iter().try_fold(1u64, |acc, &dim| {
76                acc.checked_mul(dim).ok_or_else(|| {
77                    Error::InvalidData("attribute element count overflows u64".to_string())
78                })
79            })
80        }
81    }
82
83    /// Read the attribute value as a scalar of the given type.
84    pub fn read_scalar<T: crate::datatype_api::H5Type>(&self) -> Result<T> {
85        T::from_bytes(&self.raw_data, &self.datatype)
86    }
87
88    /// Read the attribute as a 1-D vector of the given type.
89    pub fn read_1d<T: crate::datatype_api::H5Type>(&self) -> Result<Vec<T>> {
90        let elem_size = T::element_size(&self.datatype);
91        let n = checked_usize(self.num_elements()?, "attribute element count")?;
92        let mut result = Vec::with_capacity(n);
93        for i in 0..n {
94            let start = i * elem_size;
95            let end = start + elem_size;
96            if end > self.raw_data.len() {
97                return Err(Error::InvalidData(format!(
98                    "attribute data too short: need {} bytes, have {}",
99                    end,
100                    self.raw_data.len()
101                )));
102            }
103            result.push(T::from_bytes(&self.raw_data[start..end], &self.datatype)?);
104        }
105        Ok(result)
106    }
107
108    /// Read the attribute as a string (for string-typed attributes).
109    ///
110    /// For variable-length strings, use `read_vlen_string()` with the file data
111    /// and offset_size — this method will return an error directing you there.
112    pub fn read_string(&self) -> Result<String> {
113        match &self.datatype {
114            Datatype::VarLen {
115                base,
116                kind: VarLenKind::String,
117                ..
118            } if is_byte_vlen(base) => decode_varlen_byte_string(&self.raw_data),
119            Datatype::String {
120                size,
121                encoding,
122                padding,
123            } => match size {
124                StringSize::Fixed(len) => {
125                    let len = *len as usize;
126                    let bytes = if self.raw_data.len() < len {
127                        &self.raw_data
128                    } else {
129                        &self.raw_data[..len]
130                    };
131                    decode_string(bytes, *padding, *encoding)
132                }
133                StringSize::Variable => {
134                    // For inline vlen strings in attributes, try direct decode.
135                    // If it looks like a global heap reference (>= 12 bytes for
136                    // seq_len + addr + index), suggest read_vlen_string instead.
137                    if self.raw_data.len() >= 12 {
138                        // Try to decode directly first — some files inline the string
139                        let trimmed = match padding {
140                            StringPadding::NullTerminate => {
141                                let end = self
142                                    .raw_data
143                                    .iter()
144                                    .position(|&b| b == 0)
145                                    .unwrap_or(self.raw_data.len());
146                                &self.raw_data[..end]
147                            }
148                            _ => &self.raw_data,
149                        };
150                        if let Ok(s) = String::from_utf8(trimmed.to_vec()) {
151                            if s.chars()
152                                .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
153                            {
154                                return Ok(s);
155                            }
156                        }
157                    }
158                    decode_string(&self.raw_data, *padding, *encoding)
159                }
160            },
161            _ => Err(Error::TypeMismatch {
162                expected: "String".into(),
163                actual: format!("{:?}", self.datatype),
164            }),
165        }
166    }
167
168    /// Read a variable-length string attribute from the global heap.
169    ///
170    /// Variable-length strings in HDF5 are stored as references into a global
171    /// heap collection. Each reference is: `seq_len(u32) + heap_addr(offset_size) + index(u32)`.
172    pub fn read_vlen_string(&self, file_data: &[u8], offset_size: u8) -> Result<String> {
173        match &self.datatype {
174            Datatype::String {
175                size: StringSize::Variable,
176                encoding,
177                padding,
178            } => {
179                let ref_size = 4 + offset_size as usize + 4; // seq_len + addr + index
180                if self.raw_data.len() < ref_size {
181                    // Fallback: try direct decode
182                    return decode_string(&self.raw_data, *padding, *encoding);
183                }
184                let bytes = read_one_vlen_string(
185                    &self.raw_data,
186                    0,
187                    file_data,
188                    offset_size,
189                    *padding,
190                    *encoding,
191                )?;
192                Ok(bytes)
193            }
194            Datatype::String {
195                size: StringSize::Fixed(_),
196                ..
197            } => self.read_string(),
198            _ => Err(Error::TypeMismatch {
199                expected: "String".into(),
200                actual: format!("{:?}", self.datatype),
201            }),
202        }
203    }
204
205    /// Read an array of variable-length strings from the global heap.
206    pub fn read_vlen_strings(&self, file_data: &[u8], offset_size: u8) -> Result<Vec<String>> {
207        match &self.datatype {
208            Datatype::String {
209                size: StringSize::Variable,
210                encoding,
211                padding,
212            } => {
213                let ref_size = 4 + offset_size as usize + 4;
214                let n = checked_usize(self.num_elements()?, "attribute string element count")?;
215                let mut result = Vec::with_capacity(n);
216                for i in 0..n {
217                    let offset = i * ref_size;
218                    if offset + ref_size > self.raw_data.len() {
219                        break;
220                    }
221                    result.push(read_one_vlen_string(
222                        &self.raw_data,
223                        offset,
224                        file_data,
225                        offset_size,
226                        *padding,
227                        *encoding,
228                    )?);
229                }
230                Ok(result)
231            }
232            Datatype::String {
233                size: StringSize::Fixed(_),
234                ..
235            } => self.read_strings(),
236            _ => Err(Error::TypeMismatch {
237                expected: "String array".into(),
238                actual: format!("{:?}", self.datatype),
239            }),
240        }
241    }
242
243    /// Read the attribute as a vector of strings.
244    pub fn read_strings(&self) -> Result<Vec<String>> {
245        match &self.datatype {
246            Datatype::String {
247                size: StringSize::Fixed(len),
248                encoding,
249                padding,
250            } => {
251                let len = *len as usize;
252                let n = checked_usize(self.num_elements()?, "attribute string element count")?;
253                let mut result = Vec::with_capacity(n);
254                for i in 0..n {
255                    let start = i * len;
256                    let end = (start + len).min(self.raw_data.len());
257                    if start >= self.raw_data.len() {
258                        break;
259                    }
260                    result.push(decode_string(
261                        &self.raw_data[start..end],
262                        *padding,
263                        *encoding,
264                    )?);
265                }
266                Ok(result)
267            }
268            _ => Err(Error::TypeMismatch {
269                expected: "String array".into(),
270                actual: format!("{:?}", self.datatype),
271            }),
272        }
273    }
274
275    /// Read an attribute as f64 (with automatic promotion from int types).
276    pub fn read_as_f64(&self) -> Result<f64> {
277        match &self.datatype {
278            Datatype::FloatingPoint { size, .. } => {
279                let val: f64 = match size {
280                    4 => {
281                        let v = self.read_scalar::<f32>()?;
282                        v as f64
283                    }
284                    8 => self.read_scalar::<f64>()?,
285                    _ => {
286                        return Err(Error::TypeMismatch {
287                            expected: "f32 or f64".into(),
288                            actual: format!("FloatingPoint(size={})", size),
289                        })
290                    }
291                };
292                Ok(val)
293            }
294            Datatype::FixedPoint { size, signed, .. } => {
295                let val = match (size, signed) {
296                    (1, true) => self.read_scalar::<i8>()? as f64,
297                    (1, false) => self.read_scalar::<u8>()? as f64,
298                    (2, true) => self.read_scalar::<i16>()? as f64,
299                    (2, false) => self.read_scalar::<u16>()? as f64,
300                    (4, true) => self.read_scalar::<i32>()? as f64,
301                    (4, false) => self.read_scalar::<u32>()? as f64,
302                    (8, true) => self.read_scalar::<i64>()? as f64,
303                    (8, false) => self.read_scalar::<u64>()? as f64,
304                    _ => {
305                        return Err(Error::TypeMismatch {
306                            expected: "numeric".into(),
307                            actual: format!("FixedPoint(size={})", size),
308                        })
309                    }
310                };
311                Ok(val)
312            }
313            _ => Err(Error::TypeMismatch {
314                expected: "numeric".into(),
315                actual: format!("{:?}", self.datatype),
316            }),
317        }
318    }
319}
320
321pub(crate) fn collect_attribute_messages_storage(
322    header: &ObjectHeader,
323    storage: &dyn Storage,
324    offset_size: u8,
325    length_size: u8,
326    filter_registry: Option<&FilterRegistry>,
327) -> Result<Vec<AttributeMessage>> {
328    let mut attributes = Vec::new();
329    let mut attribute_info = None;
330
331    for msg in &header.messages {
332        match msg {
333            HdfMessage::Attribute(attr) => attributes.push(attr.clone()),
334            HdfMessage::AttributeInfo(info) => attribute_info = Some(info.clone()),
335            _ => {}
336        }
337    }
338
339    if let Some(info) = attribute_info {
340        attributes.extend(load_dense_attribute_messages_storage(
341            &info,
342            storage,
343            offset_size,
344            length_size,
345            filter_registry,
346        )?);
347    }
348
349    Ok(attributes)
350}
351
352fn load_dense_attribute_messages_storage(
353    info: &AttributeInfoMessage,
354    storage: &dyn Storage,
355    offset_size: u8,
356    length_size: u8,
357    filter_registry: Option<&FilterRegistry>,
358) -> Result<Vec<AttributeMessage>> {
359    if Cursor::is_undefined_offset(info.fractal_heap_address, offset_size) {
360        return Ok(Vec::new());
361    }
362
363    let heap = FractalHeap::parse_at_storage(
364        storage,
365        info.fractal_heap_address,
366        offset_size,
367        length_size,
368    )?;
369
370    let records = load_dense_attribute_records_storage(info, storage, offset_size, length_size)?;
371
372    let mut attributes = Vec::new();
373    for record in records {
374        let heap_id = match record {
375            btree_v2::BTreeV2Record::AttributeNameHash { heap_id, .. }
376            | btree_v2::BTreeV2Record::AttributeCreationOrder { heap_id, .. } => heap_id,
377            _ => continue,
378        };
379
380        let managed_bytes = heap.get_object_storage_with_registry(
381            &heap_id,
382            storage,
383            offset_size,
384            length_size,
385            filter_registry,
386        )?;
387
388        let mut attr_cursor = Cursor::new(&managed_bytes);
389        let attr = messages::attribute::parse(
390            &mut attr_cursor,
391            offset_size,
392            length_size,
393            managed_bytes.len(),
394        )?;
395        attributes.push(attr);
396    }
397
398    Ok(attributes)
399}
400
401fn load_dense_attribute_records_storage(
402    info: &AttributeInfoMessage,
403    storage: &dyn Storage,
404    offset_size: u8,
405    length_size: u8,
406) -> Result<Vec<btree_v2::BTreeV2Record>> {
407    let mut addrs = vec![("name", info.btree_name_index_address)];
408    if let Some(creation_order_addr) = info.btree_creation_order_address {
409        addrs.push(("creation-order", creation_order_addr));
410    }
411
412    let mut last_error = None;
413    for (index_name, addr) in addrs {
414        if Cursor::is_undefined_offset(addr, offset_size) {
415            continue;
416        }
417
418        let header = match btree_v2::BTreeV2Header::parse_at_storage(
419            storage,
420            addr,
421            offset_size,
422            length_size,
423        ) {
424            Ok(header) => header,
425            Err(err) => {
426                last_error = Some(format!(
427                    "failed to parse dense attribute {index_name} B-tree at {addr:#x}: {err}"
428                ));
429                continue;
430            }
431        };
432
433        match btree_v2::collect_btree_v2_records_storage(
434            storage,
435            &header,
436            offset_size,
437            length_size,
438            None,
439            &[],
440            None,
441        ) {
442            Ok(records) => return Ok(records),
443            Err(err) => {
444                last_error = Some(format!(
445                    "failed to read dense attribute {index_name} B-tree at {addr:#x}: {err}"
446                ));
447            }
448        }
449    }
450
451    if let Some(err) = last_error {
452        Err(Error::InvalidData(format!(
453            "failed to load dense attribute records: {err}"
454        )))
455    } else {
456        Ok(Vec::new())
457    }
458}
459
460/// Read one variable-length string from a vlen reference in raw_data.
461pub(crate) fn read_one_vlen_string(
462    raw_data: &[u8],
463    offset: usize,
464    file_data: &[u8],
465    offset_size: u8,
466    padding: StringPadding,
467    encoding: StringEncoding,
468) -> Result<String> {
469    let mut cursor = Cursor::new(&raw_data[offset..]);
470    let _seq_len = cursor.read_u32_le()?;
471    let heap_addr = cursor.read_offset(offset_size)?;
472    let obj_index = cursor.read_u32_le()?;
473
474    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
475        return Ok(String::new());
476    }
477
478    let mut heap_cursor = Cursor::new(file_data);
479    heap_cursor.set_position(heap_addr);
480    let collection = GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size)?;
481
482    match collection.get_object(obj_index as u16) {
483        Some(obj) => decode_string(&obj.data, padding, encoding),
484        None => Ok(String::new()),
485    }
486}
487
488pub(crate) fn read_one_vlen_string_storage(
489    raw_data: &[u8],
490    offset: usize,
491    storage: &dyn Storage,
492    offset_size: u8,
493    length_size: u8,
494    padding: StringPadding,
495    encoding: StringEncoding,
496) -> Result<String> {
497    let mut cursor = Cursor::new(&raw_data[offset..]);
498    let _seq_len = cursor.read_u32_le()?;
499    let heap_addr = cursor.read_offset(offset_size)?;
500    let obj_index = cursor.read_u32_le()?;
501
502    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
503        return Ok(String::new());
504    }
505
506    let collection =
507        GlobalHeapCollection::parse_at_storage(storage, heap_addr, offset_size, length_size)?;
508    match collection.get_object(obj_index as u16) {
509        Some(obj) => decode_string(&obj.data, padding, encoding),
510        None => Ok(String::new()),
511    }
512}
513
514/// Decode a byte slice into a String, handling padding and encoding.
515///
516/// HDF5 supports ASCII and UTF-8 string encodings. Both are valid UTF-8
517/// (ASCII is a strict subset), so we decode uniformly via `from_utf8`.
518pub(crate) fn decode_string(
519    bytes: &[u8],
520    padding: StringPadding,
521    _encoding: StringEncoding,
522) -> Result<String> {
523    let trimmed = match padding {
524        StringPadding::NullTerminate => {
525            let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
526            &bytes[..end]
527        }
528        StringPadding::NullPad => {
529            let end = bytes.iter().rposition(|&b| b != 0).map_or(0, |i| i + 1);
530            &bytes[..end]
531        }
532        StringPadding::SpacePad => {
533            let end = bytes.iter().rposition(|&b| b != b' ').map_or(0, |i| i + 1);
534            &bytes[..end]
535        }
536    };
537
538    String::from_utf8(trimmed.to_vec())
539        .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
540}
541
542fn is_byte_vlen(base: &Datatype) -> bool {
543    matches!(base, Datatype::FixedPoint { size: 1, .. })
544}
545
546pub(crate) fn decode_varlen_byte_string(bytes: &[u8]) -> Result<String> {
547    let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
548    String::from_utf8(bytes[..end].to_vec())
549        .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
550}
551
552pub(crate) fn resolve_vlen_bytes(
553    raw_data: &[u8],
554    file_data: &[u8],
555    offset_size: u8,
556) -> Option<Vec<u8>> {
557    if raw_data.len() < 4 + offset_size as usize + 4 {
558        return None;
559    }
560
561    let mut cursor = Cursor::new(raw_data);
562    let seq_len = cursor.read_u32_le().ok()? as usize;
563    let heap_addr = cursor.read_offset(offset_size).ok()?;
564    let obj_index = cursor.read_u32_le().ok()? as u16;
565
566    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
567        return Some(Vec::new());
568    }
569
570    let mut heap_cursor = Cursor::new(file_data);
571    heap_cursor.set_position(heap_addr);
572    let collection =
573        GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size).ok()?;
574    let object = collection.get_object(obj_index)?;
575    Some(object.data[..object.data.len().min(seq_len)].to_vec())
576}
577
578pub(crate) fn resolve_vlen_bytes_storage(
579    raw_data: &[u8],
580    storage: &dyn Storage,
581    offset_size: u8,
582    length_size: u8,
583) -> Option<Vec<u8>> {
584    if raw_data.len() < 4 + offset_size as usize + 4 {
585        return None;
586    }
587
588    let mut cursor = Cursor::new(raw_data);
589    let seq_len = cursor.read_u32_le().ok()? as usize;
590    let heap_addr = cursor.read_offset(offset_size).ok()?;
591    let obj_index = cursor.read_u32_le().ok()? as u16;
592
593    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
594        return Some(Vec::new());
595    }
596
597    let collection =
598        GlobalHeapCollection::parse_at_storage(storage, heap_addr, offset_size, length_size)
599            .ok()?;
600    let object = collection.get_object(obj_index)?;
601    Some(object.data[..object.data.len().min(seq_len)].to_vec())
602}
603
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ByteOrder;
    use crate::storage::BytesStorage;
    use std::f64::consts::PI;

    /// Little-endian signed 32-bit integer datatype shared by the integer tests.
    fn i32_le() -> Datatype {
        Datatype::FixedPoint {
            size: 4,
            signed: true,
            byte_order: ByteOrder::LittleEndian,
        }
    }

    /// Assembles an attribute fixture from its four parts.
    fn attribute(name: &str, datatype: Datatype, shape: Vec<u64>, raw_data: Vec<u8>) -> Attribute {
        Attribute {
            name: name.to_string(),
            datatype,
            shape,
            raw_data,
        }
    }

    /// A scalar f64 round-trips through `read_scalar`.
    #[test]
    fn scalar_f64_attribute() {
        let datatype = Datatype::FloatingPoint {
            size: 8,
            byte_order: ByteOrder::LittleEndian,
        };
        let attr = attribute("pi", datatype, vec![], PI.to_le_bytes().to_vec());

        let val = attr.read_scalar::<f64>().unwrap();
        assert!((val - PI).abs() < 1e-10);
    }

    /// Four packed i32 values are read back in order by `read_1d`.
    #[test]
    fn one_dimensional_i32_attribute() {
        let raw_data: Vec<u8> = [1i32, 2, 3, 4]
            .iter()
            .flat_map(|value| value.to_le_bytes())
            .collect();
        let attr = attribute("data", i32_le(), vec![4], raw_data);

        let result = attr.read_1d::<i32>().unwrap();
        assert_eq!(result, vec![1, 2, 3, 4]);
    }

    /// Trailing NUL padding is stripped from a fixed-size string.
    #[test]
    fn string_attribute() {
        let datatype = Datatype::String {
            size: StringSize::Fixed(10),
            encoding: StringEncoding::Ascii,
            padding: StringPadding::NullPad,
        };
        let attr = attribute("units", datatype, vec![], b"meters\0\0\0\0".to_vec());

        assert_eq!(attr.read_string().unwrap(), "meters");
    }

    /// A variable-length sequence of bytes tagged as a string decodes as text.
    #[test]
    fn varlen_byte_string_attribute() {
        let byte_type = Datatype::FixedPoint {
            size: 1,
            signed: false,
            byte_order: ByteOrder::LittleEndian,
        };
        let datatype = Datatype::VarLen {
            base: Box::new(byte_type),
            kind: VarLenKind::String,
            encoding: StringEncoding::Utf8,
            padding: StringPadding::NullTerminate,
        };
        let attr = attribute("name", datatype, vec![], b"test_dataset".to_vec());

        assert_eq!(attr.read_string().unwrap(), "test_dataset");
    }

    /// Integer attributes are promoted to f64 by `read_as_f64`.
    #[test]
    fn read_as_f64_from_int() {
        let attr = attribute("count", i32_le(), vec![], 42i32.to_le_bytes().to_vec());

        let val = attr.read_as_f64().unwrap();
        assert!((val - 42.0).abs() < 1e-10);
    }

    /// A B-tree that cannot be parsed surfaces as an `InvalidData` error rather
    /// than an empty record list.
    #[test]
    fn dense_attribute_btree_errors_surface() {
        let info = AttributeInfoMessage {
            creation_order_tracked: false,
            creation_order_indexed: false,
            max_creation_index: None,
            fractal_heap_address: 0,
            btree_name_index_address: 0,
            btree_creation_order_address: None,
        };
        let storage = BytesStorage::new(Vec::new());

        let err = load_dense_attribute_records_storage(&info, &storage, 8, 8).unwrap_err();
        assert!(matches!(err, Error::InvalidData(_)));
        assert!(err.to_string().contains("dense attribute"));
    }
}