1use crate::error::{Error, Result};
2use crate::filters::FilterRegistry;
3use crate::fractal_heap::FractalHeap;
4use crate::global_heap::GlobalHeapCollection;
5use crate::io::Cursor;
6use crate::messages::attribute::AttributeMessage;
7use crate::messages::attribute_info::AttributeInfoMessage;
8use crate::messages::dataspace::DataspaceType;
9use crate::messages::datatype::{Datatype, StringEncoding, StringPadding, StringSize, VarLenKind};
10use crate::messages::HdfMessage;
11use crate::object_header::ObjectHeader;
12use crate::storage::Storage;
13use crate::{btree_v2, messages};
14
/// An attribute value held in memory: its name, datatype, dimensions and the
/// bytes of its value.
///
/// The typed accessors (`read_scalar`, `read_1d`, `read_string`, ...) interpret
/// `raw_data` according to `datatype`.
#[derive(Debug, Clone)]
pub struct Attribute {
    /// Attribute name, taken from the source attribute message.
    pub name: String,
    /// Datatype used to interpret `raw_data`.
    pub datatype: Datatype,
    /// Dimensions of the value. As built by `from_message_with_context`: empty
    /// for a scalar dataspace, `[0]` for a null dataspace, otherwise the
    /// dataspace dimensions.
    pub shape: Vec<u64>,
    /// Bytes of the attribute value. For variable-length types these may be a
    /// global heap reference rather than the value itself, unless the
    /// constructor was able to resolve it.
    pub raw_data: Vec<u8>,
}
23
24impl Attribute {
25 pub fn from_message(msg: AttributeMessage) -> Self {
27 Self::from_message_with_context(msg, None, 0)
28 }
29
30 pub fn from_message_with_context(
33 msg: AttributeMessage,
34 file_data: Option<&[u8]>,
35 offset_size: u8,
36 ) -> Self {
37 let shape = match msg.dataspace.dataspace_type {
38 DataspaceType::Scalar => vec![],
39 DataspaceType::Null => vec![0],
40 DataspaceType::Simple => msg.dataspace.dims.clone(),
41 };
42 let raw_data = if let (Some(file_data), Datatype::VarLen { base, kind, .. }) =
43 (file_data, &msg.datatype)
44 {
45 if *kind == VarLenKind::String && is_byte_vlen(base) && shape.is_empty() {
46 resolve_vlen_bytes(&msg.raw_data, file_data, offset_size)
47 .unwrap_or_else(|| msg.raw_data.clone())
48 } else {
49 msg.raw_data.clone()
50 }
51 } else {
52 msg.raw_data.clone()
53 };
54 Attribute {
55 name: msg.name,
56 datatype: msg.datatype,
57 shape,
58 raw_data,
59 }
60 }
61
62 pub fn num_elements(&self) -> u64 {
64 if self.shape.is_empty() {
65 1 } else {
67 self.shape.iter().product()
68 }
69 }
70
71 pub fn read_scalar<T: crate::datatype_api::H5Type>(&self) -> Result<T> {
73 T::from_bytes(&self.raw_data, &self.datatype)
74 }
75
76 pub fn read_1d<T: crate::datatype_api::H5Type>(&self) -> Result<Vec<T>> {
78 let elem_size = T::element_size(&self.datatype);
79 let n = self.num_elements() as usize;
80 let mut result = Vec::with_capacity(n);
81 for i in 0..n {
82 let start = i * elem_size;
83 let end = start + elem_size;
84 if end > self.raw_data.len() {
85 return Err(Error::InvalidData(format!(
86 "attribute data too short: need {} bytes, have {}",
87 end,
88 self.raw_data.len()
89 )));
90 }
91 result.push(T::from_bytes(&self.raw_data[start..end], &self.datatype)?);
92 }
93 Ok(result)
94 }
95
96 pub fn read_string(&self) -> Result<String> {
101 match &self.datatype {
102 Datatype::VarLen {
103 base,
104 kind: VarLenKind::String,
105 ..
106 } if is_byte_vlen(base) => decode_varlen_byte_string(&self.raw_data),
107 Datatype::String {
108 size,
109 encoding,
110 padding,
111 } => match size {
112 StringSize::Fixed(len) => {
113 let len = *len as usize;
114 let bytes = if self.raw_data.len() < len {
115 &self.raw_data
116 } else {
117 &self.raw_data[..len]
118 };
119 decode_string(bytes, *padding, *encoding)
120 }
121 StringSize::Variable => {
122 if self.raw_data.len() >= 12 {
126 let trimmed = match padding {
128 StringPadding::NullTerminate => {
129 let end = self
130 .raw_data
131 .iter()
132 .position(|&b| b == 0)
133 .unwrap_or(self.raw_data.len());
134 &self.raw_data[..end]
135 }
136 _ => &self.raw_data,
137 };
138 if let Ok(s) = String::from_utf8(trimmed.to_vec()) {
139 if s.chars()
140 .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
141 {
142 return Ok(s);
143 }
144 }
145 }
146 decode_string(&self.raw_data, *padding, *encoding)
147 }
148 },
149 _ => Err(Error::TypeMismatch {
150 expected: "String".into(),
151 actual: format!("{:?}", self.datatype),
152 }),
153 }
154 }
155
156 pub fn read_vlen_string(&self, file_data: &[u8], offset_size: u8) -> Result<String> {
161 match &self.datatype {
162 Datatype::String {
163 size: StringSize::Variable,
164 encoding,
165 padding,
166 } => {
167 let ref_size = 4 + offset_size as usize + 4; if self.raw_data.len() < ref_size {
169 return decode_string(&self.raw_data, *padding, *encoding);
171 }
172 let bytes = read_one_vlen_string(
173 &self.raw_data,
174 0,
175 file_data,
176 offset_size,
177 *padding,
178 *encoding,
179 )?;
180 Ok(bytes)
181 }
182 Datatype::String {
183 size: StringSize::Fixed(_),
184 ..
185 } => self.read_string(),
186 _ => Err(Error::TypeMismatch {
187 expected: "String".into(),
188 actual: format!("{:?}", self.datatype),
189 }),
190 }
191 }
192
193 pub fn read_vlen_strings(&self, file_data: &[u8], offset_size: u8) -> Result<Vec<String>> {
195 match &self.datatype {
196 Datatype::String {
197 size: StringSize::Variable,
198 encoding,
199 padding,
200 } => {
201 let ref_size = 4 + offset_size as usize + 4;
202 let n = self.num_elements() as usize;
203 let mut result = Vec::with_capacity(n);
204 for i in 0..n {
205 let offset = i * ref_size;
206 if offset + ref_size > self.raw_data.len() {
207 break;
208 }
209 result.push(read_one_vlen_string(
210 &self.raw_data,
211 offset,
212 file_data,
213 offset_size,
214 *padding,
215 *encoding,
216 )?);
217 }
218 Ok(result)
219 }
220 Datatype::String {
221 size: StringSize::Fixed(_),
222 ..
223 } => self.read_strings(),
224 _ => Err(Error::TypeMismatch {
225 expected: "String array".into(),
226 actual: format!("{:?}", self.datatype),
227 }),
228 }
229 }
230
231 pub fn read_strings(&self) -> Result<Vec<String>> {
233 match &self.datatype {
234 Datatype::String {
235 size: StringSize::Fixed(len),
236 encoding,
237 padding,
238 } => {
239 let len = *len as usize;
240 let n = self.num_elements() as usize;
241 let mut result = Vec::with_capacity(n);
242 for i in 0..n {
243 let start = i * len;
244 let end = (start + len).min(self.raw_data.len());
245 if start >= self.raw_data.len() {
246 break;
247 }
248 result.push(decode_string(
249 &self.raw_data[start..end],
250 *padding,
251 *encoding,
252 )?);
253 }
254 Ok(result)
255 }
256 _ => Err(Error::TypeMismatch {
257 expected: "String array".into(),
258 actual: format!("{:?}", self.datatype),
259 }),
260 }
261 }
262
263 pub fn read_as_f64(&self) -> Result<f64> {
265 match &self.datatype {
266 Datatype::FloatingPoint { size, .. } => {
267 let val: f64 = match size {
268 4 => {
269 let v = self.read_scalar::<f32>()?;
270 v as f64
271 }
272 8 => self.read_scalar::<f64>()?,
273 _ => {
274 return Err(Error::TypeMismatch {
275 expected: "f32 or f64".into(),
276 actual: format!("FloatingPoint(size={})", size),
277 })
278 }
279 };
280 Ok(val)
281 }
282 Datatype::FixedPoint { size, signed, .. } => {
283 let val = match (size, signed) {
284 (1, true) => self.read_scalar::<i8>()? as f64,
285 (1, false) => self.read_scalar::<u8>()? as f64,
286 (2, true) => self.read_scalar::<i16>()? as f64,
287 (2, false) => self.read_scalar::<u16>()? as f64,
288 (4, true) => self.read_scalar::<i32>()? as f64,
289 (4, false) => self.read_scalar::<u32>()? as f64,
290 (8, true) => self.read_scalar::<i64>()? as f64,
291 (8, false) => self.read_scalar::<u64>()? as f64,
292 _ => {
293 return Err(Error::TypeMismatch {
294 expected: "numeric".into(),
295 actual: format!("FixedPoint(size={})", size),
296 })
297 }
298 };
299 Ok(val)
300 }
301 _ => Err(Error::TypeMismatch {
302 expected: "numeric".into(),
303 actual: format!("{:?}", self.datatype),
304 }),
305 }
306 }
307}
308
309pub(crate) fn collect_attribute_messages_storage(
310 header: &ObjectHeader,
311 storage: &dyn Storage,
312 offset_size: u8,
313 length_size: u8,
314 filter_registry: Option<&FilterRegistry>,
315) -> Result<Vec<AttributeMessage>> {
316 let mut attributes = Vec::new();
317 let mut attribute_info = None;
318
319 for msg in &header.messages {
320 match msg {
321 HdfMessage::Attribute(attr) => attributes.push(attr.clone()),
322 HdfMessage::AttributeInfo(info) => attribute_info = Some(info.clone()),
323 _ => {}
324 }
325 }
326
327 if let Some(info) = attribute_info {
328 attributes.extend(load_dense_attribute_messages_storage(
329 &info,
330 storage,
331 offset_size,
332 length_size,
333 filter_registry,
334 )?);
335 }
336
337 Ok(attributes)
338}
339
340fn load_dense_attribute_messages_storage(
341 info: &AttributeInfoMessage,
342 storage: &dyn Storage,
343 offset_size: u8,
344 length_size: u8,
345 filter_registry: Option<&FilterRegistry>,
346) -> Result<Vec<AttributeMessage>> {
347 if Cursor::is_undefined_offset(info.fractal_heap_address, offset_size) {
348 return Ok(Vec::new());
349 }
350
351 let heap = FractalHeap::parse_at_storage(
352 storage,
353 info.fractal_heap_address,
354 offset_size,
355 length_size,
356 )?;
357
358 let records = load_dense_attribute_records_storage(info, storage, offset_size, length_size)?;
359
360 let mut attributes = Vec::new();
361 for record in records {
362 let heap_id = match record {
363 btree_v2::BTreeV2Record::AttributeNameHash { heap_id, .. }
364 | btree_v2::BTreeV2Record::AttributeCreationOrder { heap_id, .. } => heap_id,
365 _ => continue,
366 };
367
368 let managed_bytes = heap.get_object_storage_with_registry(
369 &heap_id,
370 storage,
371 offset_size,
372 length_size,
373 filter_registry,
374 )?;
375
376 let mut attr_cursor = Cursor::new(&managed_bytes);
377 let attr = messages::attribute::parse(
378 &mut attr_cursor,
379 offset_size,
380 length_size,
381 managed_bytes.len(),
382 )?;
383 attributes.push(attr);
384 }
385
386 Ok(attributes)
387}
388
389fn load_dense_attribute_records_storage(
390 info: &AttributeInfoMessage,
391 storage: &dyn Storage,
392 offset_size: u8,
393 length_size: u8,
394) -> Result<Vec<btree_v2::BTreeV2Record>> {
395 let mut addrs = vec![("name", info.btree_name_index_address)];
396 if let Some(creation_order_addr) = info.btree_creation_order_address {
397 addrs.push(("creation-order", creation_order_addr));
398 }
399
400 let mut last_error = None;
401 for (index_name, addr) in addrs {
402 if Cursor::is_undefined_offset(addr, offset_size) {
403 continue;
404 }
405
406 let header = match btree_v2::BTreeV2Header::parse_at_storage(
407 storage,
408 addr,
409 offset_size,
410 length_size,
411 ) {
412 Ok(header) => header,
413 Err(err) => {
414 last_error = Some(format!(
415 "failed to parse dense attribute {index_name} B-tree at {addr:#x}: {err}"
416 ));
417 continue;
418 }
419 };
420
421 match btree_v2::collect_btree_v2_records_storage(
422 storage,
423 &header,
424 offset_size,
425 length_size,
426 None,
427 &[],
428 None,
429 ) {
430 Ok(records) => return Ok(records),
431 Err(err) => {
432 last_error = Some(format!(
433 "failed to read dense attribute {index_name} B-tree at {addr:#x}: {err}"
434 ));
435 }
436 }
437 }
438
439 if let Some(err) = last_error {
440 Err(Error::InvalidData(format!(
441 "failed to load dense attribute records: {err}"
442 )))
443 } else {
444 Ok(Vec::new())
445 }
446}
447
448pub(crate) fn read_one_vlen_string(
450 raw_data: &[u8],
451 offset: usize,
452 file_data: &[u8],
453 offset_size: u8,
454 padding: StringPadding,
455 encoding: StringEncoding,
456) -> Result<String> {
457 let mut cursor = Cursor::new(&raw_data[offset..]);
458 let _seq_len = cursor.read_u32_le()?;
459 let heap_addr = cursor.read_offset(offset_size)?;
460 let obj_index = cursor.read_u32_le()?;
461
462 if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
463 return Ok(String::new());
464 }
465
466 let mut heap_cursor = Cursor::new(file_data);
467 heap_cursor.set_position(heap_addr);
468 let collection = GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size)?;
469
470 match collection.get_object(obj_index as u16) {
471 Some(obj) => decode_string(&obj.data, padding, encoding),
472 None => Ok(String::new()),
473 }
474}
475
476pub(crate) fn read_one_vlen_string_storage(
477 raw_data: &[u8],
478 offset: usize,
479 storage: &dyn Storage,
480 offset_size: u8,
481 length_size: u8,
482 padding: StringPadding,
483 encoding: StringEncoding,
484) -> Result<String> {
485 let mut cursor = Cursor::new(&raw_data[offset..]);
486 let _seq_len = cursor.read_u32_le()?;
487 let heap_addr = cursor.read_offset(offset_size)?;
488 let obj_index = cursor.read_u32_le()?;
489
490 if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
491 return Ok(String::new());
492 }
493
494 let collection =
495 GlobalHeapCollection::parse_at_storage(storage, heap_addr, offset_size, length_size)?;
496 match collection.get_object(obj_index as u16) {
497 Some(obj) => decode_string(&obj.data, padding, encoding),
498 None => Ok(String::new()),
499 }
500}
501
502pub(crate) fn decode_string(
507 bytes: &[u8],
508 padding: StringPadding,
509 _encoding: StringEncoding,
510) -> Result<String> {
511 let trimmed = match padding {
512 StringPadding::NullTerminate => {
513 let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
514 &bytes[..end]
515 }
516 StringPadding::NullPad => {
517 let end = bytes.iter().rposition(|&b| b != 0).map_or(0, |i| i + 1);
518 &bytes[..end]
519 }
520 StringPadding::SpacePad => {
521 let end = bytes.iter().rposition(|&b| b != b' ').map_or(0, |i| i + 1);
522 &bytes[..end]
523 }
524 };
525
526 String::from_utf8(trimmed.to_vec())
527 .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
528}
529
530fn is_byte_vlen(base: &Datatype) -> bool {
531 matches!(base, Datatype::FixedPoint { size: 1, .. })
532}
533
534pub(crate) fn decode_varlen_byte_string(bytes: &[u8]) -> Result<String> {
535 let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
536 String::from_utf8(bytes[..end].to_vec())
537 .map_err(|e| Error::InvalidData(format!("invalid string data: {e}")))
538}
539
540pub(crate) fn resolve_vlen_bytes(
541 raw_data: &[u8],
542 file_data: &[u8],
543 offset_size: u8,
544) -> Option<Vec<u8>> {
545 if raw_data.len() < 4 + offset_size as usize + 4 {
546 return None;
547 }
548
549 let mut cursor = Cursor::new(raw_data);
550 let seq_len = cursor.read_u32_le().ok()? as usize;
551 let heap_addr = cursor.read_offset(offset_size).ok()?;
552 let obj_index = cursor.read_u32_le().ok()? as u16;
553
554 if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
555 return Some(Vec::new());
556 }
557
558 let mut heap_cursor = Cursor::new(file_data);
559 heap_cursor.set_position(heap_addr);
560 let collection =
561 GlobalHeapCollection::parse(&mut heap_cursor, offset_size, offset_size).ok()?;
562 let object = collection.get_object(obj_index)?;
563 Some(object.data[..object.data.len().min(seq_len)].to_vec())
564}
565
566pub(crate) fn resolve_vlen_bytes_storage(
567 raw_data: &[u8],
568 storage: &dyn Storage,
569 offset_size: u8,
570 length_size: u8,
571) -> Option<Vec<u8>> {
572 if raw_data.len() < 4 + offset_size as usize + 4 {
573 return None;
574 }
575
576 let mut cursor = Cursor::new(raw_data);
577 let seq_len = cursor.read_u32_le().ok()? as usize;
578 let heap_addr = cursor.read_offset(offset_size).ok()?;
579 let obj_index = cursor.read_u32_le().ok()? as u16;
580
581 if Cursor::is_undefined_offset(heap_addr, offset_size) || obj_index == 0 {
582 return Some(Vec::new());
583 }
584
585 let collection =
586 GlobalHeapCollection::parse_at_storage(storage, heap_addr, offset_size, length_size)
587 .ok()?;
588 let object = collection.get_object(obj_index)?;
589 Some(object.data[..object.data.len().min(seq_len)].to_vec())
590}
591
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ByteOrder;
    use crate::storage::BytesStorage;
    use std::f64::consts::PI;

    /// Assembles an attribute fixture from its four fields.
    fn make_attr(name: &str, datatype: Datatype, shape: Vec<u64>, raw_data: Vec<u8>) -> Attribute {
        Attribute {
            name: name.to_string(),
            datatype,
            shape,
            raw_data,
        }
    }

    /// Signed 32-bit little-endian integer datatype.
    fn i32_le() -> Datatype {
        Datatype::FixedPoint {
            size: 4,
            signed: true,
            byte_order: ByteOrder::LittleEndian,
        }
    }

    // A scalar f64 round-trips through `read_scalar`.
    #[test]
    fn test_scalar_f64_attribute() {
        let datatype = Datatype::FloatingPoint {
            size: 8,
            byte_order: ByteOrder::LittleEndian,
        };
        let attr = make_attr("pi", datatype, vec![], PI.to_le_bytes().to_vec());

        let val = attr.read_scalar::<f64>().unwrap();
        assert!((val - PI).abs() < 1e-10);
    }

    // Four packed i32 values are read back in order by `read_1d`.
    #[test]
    fn test_1d_i32_attribute() {
        let values = [1i32, 2, 3, 4];
        let raw_data: Vec<u8> = values.iter().flat_map(|v| v.to_le_bytes()).collect();
        let attr = make_attr("data", i32_le(), vec![4], raw_data);

        let result = attr.read_1d::<i32>().unwrap();
        assert_eq!(result, vec![1, 2, 3, 4]);
    }

    // Trailing NUL padding is removed from a fixed-size string.
    #[test]
    fn test_string_attribute() {
        let datatype = Datatype::String {
            size: StringSize::Fixed(10),
            encoding: StringEncoding::Ascii,
            padding: StringPadding::NullPad,
        };
        let attr = make_attr("units", datatype, vec![], b"meters\0\0\0\0".to_vec());

        assert_eq!(attr.read_string().unwrap(), "meters");
    }

    // A variable-length string over a one-byte base decodes its bytes directly.
    #[test]
    fn test_varlen_byte_string_attribute() {
        let byte_base = Datatype::FixedPoint {
            size: 1,
            signed: false,
            byte_order: ByteOrder::LittleEndian,
        };
        let datatype = Datatype::VarLen {
            base: Box::new(byte_base),
            kind: VarLenKind::String,
            encoding: StringEncoding::Utf8,
            padding: StringPadding::NullTerminate,
        };
        let attr = make_attr("name", datatype, vec![], b"test_dataset".to_vec());

        assert_eq!(attr.read_string().unwrap(), "test_dataset");
    }

    // An integer attribute is widened to f64 by `read_as_f64`.
    #[test]
    fn test_read_as_f64_from_int() {
        let attr = make_attr("count", i32_le(), vec![], 42i32.to_le_bytes().to_vec());

        let val = attr.read_as_f64().unwrap();
        assert!((val - 42.0).abs() < 1e-10);
    }

    // A B-tree that cannot be parsed is reported as an error, not as "no records".
    #[test]
    fn test_dense_attribute_btree_errors_surface() {
        let storage = BytesStorage::new(Vec::new());
        let info = AttributeInfoMessage {
            creation_order_tracked: false,
            creation_order_indexed: false,
            max_creation_index: None,
            fractal_heap_address: 0,
            btree_name_index_address: 0,
            btree_creation_order_address: None,
        };

        let err = load_dense_attribute_records_storage(&info, &storage, 8, 8).unwrap_err();
        assert!(matches!(err, Error::InvalidData(_)));
        assert!(err.to_string().contains("dense attribute"));
    }
}