Skip to main content

hdf5_reader/
attribute_api.rs

1use crate::error::{Error, Result};
2use crate::filters::FilterRegistry;
3use crate::fractal_heap::FractalHeap;
4use crate::global_heap::GlobalHeapCollection;
5use crate::io::Cursor;
6use crate::messages::attribute::AttributeMessage;
7use crate::messages::attribute_info::AttributeInfoMessage;
8use crate::messages::dataspace::DataspaceType;
9use crate::messages::datatype::{Datatype, StringEncoding, StringPadding, StringSize, VarLenKind};
10use crate::messages::HdfMessage;
11use crate::object_header::ObjectHeader;
12use crate::storage::Storage;
13use crate::{btree_v2, messages};
14
/// A parsed, high-level HDF5 attribute.
///
/// Built from an [`AttributeMessage`]; carries the attribute's name, its
/// datatype, its logical shape, and the still-undecoded value bytes that the
/// `read_*` methods interpret on demand.
#[derive(Debug, Clone)]
pub struct Attribute {
    /// Attribute name, taken from the attribute message.
    pub name: String,
    /// Datatype of the stored value(s), taken from the attribute message.
    pub datatype: Datatype,
    /// Dimensions of the attribute: empty for a scalar dataspace, `[0]` for a
    /// null dataspace, and the dataspace dimensions for a simple dataspace.
    pub shape: Vec<u64>,
    /// Undecoded value bytes. Normally the bytes carried by the attribute
    /// message; for a scalar variable-length byte string constructed with file
    /// context, the bytes resolved from the global heap (when resolution
    /// succeeds).
    pub raw_data: Vec<u8>,
}
23
24impl Attribute {
25    /// Create from a parsed attribute message.
26    pub fn from_message(msg: AttributeMessage) -> Self {
27        Self::from_message_with_context(msg, None, 0)
28    }
29
30    /// Create from a parsed attribute message with optional file context for
31    /// resolving variable-length byte attributes stored in the global heap.
32    pub fn from_message_with_context(
33        msg: AttributeMessage,
34        file_data: Option<&[u8]>,
35        offset_size: u8,
36    ) -> Self {
37        let shape = match msg.dataspace.dataspace_type {
38            DataspaceType::Scalar => vec![],
39            DataspaceType::Null => vec![0],
40            DataspaceType::Simple => msg.dataspace.dims.clone(),
41        };
42        let raw_data = if let (Some(file_data), Datatype::VarLen { base, kind, .. }) =
43            (file_data, &msg.datatype)
44        {
45            if *kind == VarLenKind::String && is_byte_vlen(base) && shape.is_empty() {
46                resolve_vlen_bytes(&msg.raw_data, file_data, offset_size)
47                    .unwrap_or_else(|| msg.raw_data.clone())
48            } else {
49                msg.raw_data.clone()
50            }
51        } else {
52            msg.raw_data.clone()
53        };
54        Attribute {
55            name: msg.name,
56            datatype: msg.datatype,
57            shape,
58            raw_data,
59        }
60    }
61
62    /// Total number of elements.
63    pub fn num_elements(&self) -> u64 {
64        if self.shape.is_empty() {
65            1 // scalar
66        } else {
67            self.shape.iter().product()
68        }
69    }
70
71    /// Read the attribute value as a scalar of the given type.
72    pub fn read_scalar<T: crate::datatype_api::H5Type>(&self) -> Result<T> {
73        T::from_bytes(&self.raw_data, &self.datatype)
74    }
75
76    /// Read the attribute as a 1-D vector of the given type.
77    pub fn read_1d<T: crate::datatype_api::H5Type>(&self) -> Result<Vec<T>> {
78        let elem_size = T::element_size(&self.datatype);
79        let n = self.num_elements() as usize;
80        let mut result = Vec::with_capacity(n);
81        for i in 0..n {
82            let start = i * elem_size;
83            let end = start + elem_size;
84            if end > self.raw_data.len() {
85                return Err(Error::InvalidData(format!(
86                    "attribute data too short: need {} bytes, have {}",
87                    end,
88                    self.raw_data.len()
89                )));
90            }
91            result.push(T::from_bytes(&self.raw_data[start..end], &self.datatype)?);
92        }
93        Ok(result)
94    }
95
96    /// Read the attribute as a string (for string-typed attributes).
97    ///
98    /// For variable-length strings, use `read_vlen_string()` with the file data
99    /// and offset_size — this method will return an error directing you there.
100    pub fn read_string(&self) -> Result<String> {
101        match &self.datatype {
102            Datatype::VarLen {
103                base,
104                kind: VarLenKind::String,
105                ..
106            } if is_byte_vlen(base) => decode_varlen_byte_string(&self.raw_data),
107            Datatype::String {
108                size,
109                encoding,
110                padding,
111            } => match size {
112                StringSize::Fixed(len) => {
113                    let len = *len as usize;
114                    let bytes = if self.raw_data.len() < len {
115                        &self.raw_data
116                    } else {
117                        &self.raw_data[..len]
118                    };
119                    decode_string(bytes, *padding, *encoding)
120                }
121                StringSize::Variable => {
122                    // For inline vlen strings in attributes, try direct decode.
123                    // If it looks like a global heap reference (>= 12 bytes for
124                    // seq_len + addr + index), suggest read_vlen_string instead.
125                    if self.raw_data.len() >= 12 {
126                        // Try to decode directly first — some files inline the string
127                        let trimmed = match padding {
128                            StringPadding::NullTerminate => {
129                                let end = self
130                                    .raw_data
131                                    .iter()
132                                    .position(|&b| b == 0)
133                                    .unwrap_or(self.raw_data.len());
134                                &self.raw_data[..end]
135                            }
136                            _ => &self.raw_data,
137                        };
138                        if let Ok(s) = String::from_utf8(trimmed.to_vec()) {
139                            if s.chars()
140                                .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
141                            {
142                                return Ok(s);
143                            }
144                        }
145                    }
146                    decode_string(&self.raw_data, *padding, *encoding)
147                }
148            },
149            _ => Err(Error::TypeMismatch {
150                expected: "String".into(),
151                actual: format!("{:?}", self.datatype),
152            }),
153        }
154    }
155
156    /// Read a variable-length string attribute from the global heap.
157    ///
158    /// Variable-length strings in HDF5 are stored as references into a global
159    /// heap collection. Each reference is: `seq_len(u32) + heap_addr(offset_size) + index(u32)`.
160    pub fn read_vlen_string(&self, file_data: &[u8], offset_size: u8) -> Result<String> {
161        match &self.datatype {
162            Datatype::String {
163                size: StringSize::Variable,
164                encoding,
165                padding,
166            } => {
167                let ref_size = 4 + offset_size as usize + 4; // seq_len + addr + index
168                if self.raw_data.len() < ref_size {
169                    // Fallback: try direct decode
170                    return decode_string(&self.raw_data, *padding, *encoding);
171                }
172                let bytes = read_one_vlen_string(
173                    &self.raw_data,
174                    0,
175                    file_data,
176                    offset_size,
177                    *padding,
178                    *encoding,
179                )?;
180                Ok(bytes)
181            }
182            Datatype::String {
183                size: StringSize::Fixed(_),
184                ..
185            } => self.read_string(),
186            _ => Err(Error::TypeMismatch {
187                expected: "String".into(),
188                actual: format!("{:?}", self.datatype),
189            }),
190        }
191    }
192
193    /// Read an array of variable-length strings from the global heap.
194    pub fn read_vlen_strings(&self, file_data: &[u8], offset_size: u8) -> Result<Vec<String>> {
195        match &self.datatype {
196            Datatype::String {
197                size: StringSize::Variable,
198                encoding,
199                padding,
200            } => {
201                let ref_size = 4 + offset_size as usize + 4;
202                let n = self.num_elements() as usize;
203                let mut result = Vec::with_capacity(n);
204                for i in 0..n {
205                    let offset = i * ref_size;
206                    if offset + ref_size > self.raw_data.len() {
207                        break;
208                    }
209                    result.push(read_one_vlen_string(
210                        &self.raw_data,
211                        offset,
212                        file_data,
213                        offset_size,
214                        *padding,
215                        *encoding,
216                    )?);
217                }
218                Ok(result)
219            }
220            Datatype::String {
221                size: StringSize::Fixed(_),
222                ..
223            } => self.read_strings(),
224            _ => Err(Error::TypeMismatch {
225                expected: "String array".into(),
226                actual: format!("{:?}", self.datatype),
227            }),
228        }
229    }
230
231    /// Read the attribute as a vector of strings.
232    pub fn read_strings(&self) -> Result<Vec<String>> {
233        match &self.datatype {
234            Datatype::String {
235                size: StringSize::Fixed(len),
236                encoding,
237                padding,
238            } => {
239                let len = *len as usize;
240                let n = self.num_elements() as usize;
241                let mut result = Vec::with_capacity(n);
242                for i in 0..n {
243                    let start = i * len;
244                    let end = (start + len).min(self.raw_data.len());
245                    if start >= self.raw_data.len() {
246                        break;
247                    }
248                    result.push(decode_string(
249                        &self.raw_data[start..end],
250                        *padding,
251                        *encoding,
252                    )?);
253                }
254                Ok(result)
255            }
256            _ => Err(Error::TypeMismatch {
257                expected: "String array".into(),
258                actual: format!("{:?}", self.datatype),
259            }),
260        }
261    }
262
263    /// Read an attribute as f64 (with automatic promotion from int types).
264    pub fn read_as_f64(&self) -> Result<f64> {
265        match &self.datatype {
266            Datatype::FloatingPoint { size, .. } => {
267                let val: f64 = match size {
268                    4 => {
269                        let v = self.read_scalar::<f32>()?;
270                        v as f64
271                    }
272                    8 => self.read_scalar::<f64>()?,
273                    _ => {
274                        return Err(Error::TypeMismatch {
275                            expected: "f32 or f64".into(),
276                            actual: format!("FloatingPoint(size={})", size),
277                        })
278                    }
279                };
280                Ok(val)
281            }
282            Datatype::FixedPoint { size, signed, .. } => {
283                let val = match (size, signed) {
284                    (1, true) => self.read_scalar::<i8>()? as f64,
285                    (1, false) => self.read_scalar::<u8>()? as f64,
286                    (2, true) => self.read_scalar::<i16>()? as f64,
287                    (2, false) => self.read_scalar::<u16>()? as f64,
288                    (4, true) => self.read_scalar::<i32>()? as f64,
289                    (4, false) => self.read_scalar::<u32>()? as f64,
290                    (8, true) => self.read_scalar::<i64>()? as f64,
291                    (8, false) => self.read_scalar::<u64>()? as f64,
292                    _ => {
293                        return Err(Error::TypeMismatch {
294                            expected: "numeric".into(),
295                            actual: format!("FixedPoint(size={})", size),
296                        })
297                    }
298                };
299                Ok(val)
300            }
301            _ => Err(Error::TypeMismatch {
302                expected: "numeric".into(),
303                actual: format!("{:?}", self.datatype),
304            }),
305        }
306    }
307}
308
309pub(crate) fn collect_attribute_messages_storage(
310    header: &ObjectHeader,
311    storage: &dyn Storage,
312    offset_size: u8,
313    length_size: u8,
314    filter_registry: Option<&FilterRegistry>,
315) -> Result<Vec<AttributeMessage>> {
316    let mut attributes = Vec::new();
317    let mut attribute_info = None;
318
319    for msg in &header.messages {
320        match msg {
321            HdfMessage::Attribute(attr) => attributes.push(attr.clone()),
322            HdfMessage::AttributeInfo(info) => attribute_info = Some(info.clone()),
323            _ => {}
324        }
325    }
326
327    if let Some(info) = attribute_info {
328        attributes.extend(load_dense_attribute_messages_storage(
329            &info,
330            storage,
331            offset_size,
332            length_size,
333            filter_registry,
334        )?);
335    }
336
337    Ok(attributes)
338}
339
340fn load_dense_attribute_messages_storage(
341    info: &AttributeInfoMessage,
342    storage: &dyn Storage,
343    offset_size: u8,
344    length_size: u8,
345    filter_registry: Option<&FilterRegistry>,
346) -> Result<Vec<AttributeMessage>> {
347    if Cursor::is_undefined_offset(info.fractal_heap_address, offset_size) {
348        return Ok(Vec::new());
349    }
350
351    let heap = FractalHeap::parse_at_storage(
352        storage,
353        info.fractal_heap_address,
354        offset_size,
355        length_size,
356    )?;
357
358    let records = load_dense_attribute_records_storage(info, storage, offset_size, length_size)?;
359
360    let mut attributes = Vec::new();
361    for record in records {
362        let heap_id = match record {
363            btree_v2::BTreeV2Record::AttributeNameHash { heap_id, .. }
364            | btree_v2::BTreeV2Record::AttributeCreationOrder { heap_id, .. } => heap_id,
365            _ => continue,
366        };
367
368        let managed_bytes = heap.get_object_storage_with_registry(
369            &heap_id,
370            storage,
371            offset_size,
372            length_size,
373            filter_registry,
374        )?;
375
376        let mut attr_cursor = Cursor::new(&managed_bytes);
377        let attr = messages::attribute::parse(
378            &mut attr_cursor,
379            offset_size,
380            length_size,
381            managed_bytes.len(),
382        )?;
383        attributes.push(attr);
384    }
385
386    Ok(attributes)
387}
388
389fn load_dense_attribute_records_storage(
390    info: &AttributeInfoMessage,
391    storage: &dyn Storage,
392    offset_size: u8,
393    length_size: u8,
394) -> Result<Vec<btree_v2::BTreeV2Record>> {
395    let mut addrs = vec![("name", info.btree_name_index_address)];
396    if let Some(creation_order_addr) = info.btree_creation_order_address {
397        addrs.push(("creation-order", creation_order_addr));
398    }
399
400    let mut last_error = None;
401    for (index_name, addr) in addrs {
402        if Cursor::is_undefined_offset(addr, offset_size) {
403            continue;
404        }
405
406        let header = match btree_v2::BTreeV2Header::parse_at_storage(
407            storage,
408            addr,
409            offset_size,
410            length_size,
411        ) {
412            Ok(header) => header,
413            Err(err) => {
414                last_error = Some(format!(
415                    "failed to parse dense attribute {index_name} B-tree at {addr:#x}: {err}"
416                ));
417                continue;
418            }
419        };
420
421        match btree_v2::collect_btree_v2_records_storage(
422            storage,
423            &header,
424            offset_size,
425            length_size,
426            None,
427            &[],
428            None,
429        ) {
430            Ok(records) => return Ok(records),
431            Err(err) => {
432                last_error = Some(format!(
433                    "failed to read dense attribute {index_name} B-tree at {addr:#x}: {err}"
434                ));
435            }
436        }
437    }
438
439    if let Some(err) = last_error {
440        Err(Error::InvalidData(format!(
441            "failed to load dense attribute records: {err}"
442        )))
443    } else {
444        Ok(Vec::new())
445    }
446}
447
448/// Read one variable-length string from a vlen reference in raw_data.
449pub(crate) fn read_one_vlen_string(
450    raw_data: &[u8],
451    offset: usize,
452    file_data: &[u8],
453    offset_size: u8,
454    padding: StringPadding,
455    encoding: StringEncoding,
456) -> Result<String> {
457    let mut cursor = Cursor::new(&raw_data[offset..]);
458    let _seq_len = cursor.read_u32_le()?;
459    let heap_addr = cursor.read_offset(offset_size)?;
460    let obj_index = cursor.read_u32_le()?;
461
462    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
463        return Ok(String::new());
464    }
465
466    let mut heap_cursor = Cursor::new(file_data);
467    heap_cursor.set_position(heap_addr);
468    let collection = GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size)?;
469
470    match collection.get_object(obj_index as u16) {
471        Some(obj) => decode_string(&obj.data, padding, encoding),
472        None => Ok(String::new()),
473    }
474}
475
476pub(crate) fn read_one_vlen_string_storage(
477    raw_data: &[u8],
478    offset: usize,
479    storage: &dyn Storage,
480    offset_size: u8,
481    length_size: u8,
482    padding: StringPadding,
483    encoding: StringEncoding,
484) -> Result<String> {
485    let mut cursor = Cursor::new(&raw_data[offset..]);
486    let _seq_len = cursor.read_u32_le()?;
487    let heap_addr = cursor.read_offset(offset_size)?;
488    let obj_index = cursor.read_u32_le()?;
489
490    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
491        return Ok(String::new());
492    }
493
494    let collection =
495        GlobalHeapCollection::parse_at_storage(storage, heap_addr, offset_size, length_size)?;
496    match collection.get_object(obj_index as u16) {
497        Some(obj) => decode_string(&obj.data, padding, encoding),
498        None => Ok(String::new()),
499    }
500}
501
502/// Decode a byte slice into a String, handling padding and encoding.
503///
504/// HDF5 supports ASCII and UTF-8 string encodings. Both are valid UTF-8
505/// (ASCII is a strict subset), so we decode uniformly via `from_utf8`.
506pub(crate) fn decode_string(
507    bytes: &[u8],
508    padding: StringPadding,
509    _encoding: StringEncoding,
510) -> Result<String> {
511    let trimmed = match padding {
512        StringPadding::NullTerminate => {
513            let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
514            &bytes[..end]
515        }
516        StringPadding::NullPad => {
517            let end = bytes.iter().rposition(|&b| b != 0).map_or(0, |i| i + 1);
518            &bytes[..end]
519        }
520        StringPadding::SpacePad => {
521            let end = bytes.iter().rposition(|&b| b != b' ').map_or(0, |i| i + 1);
522            &bytes[..end]
523        }
524    };
525
526    String::from_utf8(trimmed.to_vec())
527        .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
528}
529
530fn is_byte_vlen(base: &Datatype) -> bool {
531    matches!(base, Datatype::FixedPoint { size: 1, .. })
532}
533
534pub(crate) fn decode_varlen_byte_string(bytes: &[u8]) -> Result<String> {
535    let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
536    String::from_utf8(bytes[..end].to_vec())
537        .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
538}
539
540pub(crate) fn resolve_vlen_bytes(
541    raw_data: &[u8],
542    file_data: &[u8],
543    offset_size: u8,
544) -> Option<Vec<u8>> {
545    if raw_data.len() < 4 + offset_size as usize + 4 {
546        return None;
547    }
548
549    let mut cursor = Cursor::new(raw_data);
550    let seq_len = cursor.read_u32_le().ok()? as usize;
551    let heap_addr = cursor.read_offset(offset_size).ok()?;
552    let obj_index = cursor.read_u32_le().ok()? as u16;
553
554    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
555        return Some(Vec::new());
556    }
557
558    let mut heap_cursor = Cursor::new(file_data);
559    heap_cursor.set_position(heap_addr);
560    let collection =
561        GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size).ok()?;
562    let object = collection.get_object(obj_index)?;
563    Some(object.data[..object.data.len().min(seq_len)].to_vec())
564}
565
566pub(crate) fn resolve_vlen_bytes_storage(
567    raw_data: &[u8],
568    storage: &dyn Storage,
569    offset_size: u8,
570    length_size: u8,
571) -> Option<Vec<u8>> {
572    if raw_data.len() < 4 + offset_size as usize + 4 {
573        return None;
574    }
575
576    let mut cursor = Cursor::new(raw_data);
577    let seq_len = cursor.read_u32_le().ok()? as usize;
578    let heap_addr = cursor.read_offset(offset_size).ok()?;
579    let obj_index = cursor.read_u32_le().ok()? as u16;
580
581    if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
582        return Some(Vec::new());
583    }
584
585    let collection =
586        GlobalHeapCollection::parse_at_storage(storage, heap_addr, offset_size, length_size)
587            .ok()?;
588    let object = collection.get_object(obj_index)?;
589    Some(object.data[..object.data.len().min(seq_len)].to_vec())
590}
591
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ByteOrder;
    use crate::storage::BytesStorage;
    use std::f64::consts::PI;

    /// Little-endian signed 32-bit integer datatype shared by the integer tests.
    fn i32_le() -> Datatype {
        Datatype::FixedPoint {
            size: 4,
            signed: true,
            byte_order: ByteOrder::LittleEndian,
        }
    }

    /// Assemble an attribute fixture from its four parts.
    fn attribute(name: &str, datatype: Datatype, shape: Vec<u64>, raw_data: Vec<u8>) -> Attribute {
        Attribute {
            name: name.to_string(),
            datatype,
            shape,
            raw_data,
        }
    }

    // A scalar f64 round-trips through `read_scalar`.
    #[test]
    fn test_scalar_f64_attribute() {
        let datatype = Datatype::FloatingPoint {
            size: 8,
            byte_order: ByteOrder::LittleEndian,
        };
        let attr = attribute("pi", datatype, vec![], PI.to_le_bytes().to_vec());

        let val = attr.read_scalar::<f64>().unwrap();
        assert!((val - PI).abs() < 1e-10);
    }

    // Four packed i32 values decode in order via `read_1d`.
    #[test]
    fn test_1d_i32_attribute() {
        let raw_data: Vec<u8> = [1i32, 2, 3, 4]
            .iter()
            .flat_map(|v| v.to_le_bytes())
            .collect();
        let attr = attribute("data", i32_le(), vec![4], raw_data);

        let result = attr.read_1d::<i32>().unwrap();
        assert_eq!(result, vec![1, 2, 3, 4]);
    }

    // Trailing NUL padding is stripped from a fixed-size string.
    #[test]
    fn test_string_attribute() {
        let datatype = Datatype::String {
            size: StringSize::Fixed(10),
            encoding: StringEncoding::Ascii,
            padding: StringPadding::NullPad,
        };
        let attr = attribute("units", datatype, vec![], b"meters\0\0\0\0".to_vec());

        assert_eq!(attr.read_string().unwrap(), "meters");
    }

    // A variable-length byte sequence of kind `String` decodes as text.
    #[test]
    fn test_varlen_byte_string_attribute() {
        let byte_type = Datatype::FixedPoint {
            size: 1,
            signed: false,
            byte_order: ByteOrder::LittleEndian,
        };
        let datatype = Datatype::VarLen {
            base: Box::new(byte_type),
            kind: VarLenKind::String,
            encoding: StringEncoding::Utf8,
            padding: StringPadding::NullTerminate,
        };
        let attr = attribute("name", datatype, vec![], b"test_dataset".to_vec());

        assert_eq!(attr.read_string().unwrap(), "test_dataset");
    }

    // Integer attributes are promoted to f64.
    #[test]
    fn test_read_as_f64_from_int() {
        let attr = attribute("count", i32_le(), vec![], 42i32.to_le_bytes().to_vec());

        let val = attr.read_as_f64().unwrap();
        assert!((val - 42.0).abs() < 1e-10);
    }

    // An unreadable dense-attribute B-tree is reported, not swallowed.
    #[test]
    fn test_dense_attribute_btree_errors_surface() {
        let storage = BytesStorage::new(Vec::new());
        let info = AttributeInfoMessage {
            creation_order_tracked: false,
            creation_order_indexed: false,
            max_creation_index: None,
            fractal_heap_address: 0,
            btree_name_index_address: 0,
            btree_creation_order_address: None,
        };

        let err = load_dense_attribute_records_storage(&info, &storage, 8, 8).unwrap_err();
        assert!(matches!(err, Error::InvalidData(_)));
        assert!(err.to_string().contains("dense attribute"));
    }
}