use crate::dtype::DType;
use crate::shape::Shape;
#[cfg(not(feature = "std"))]
use alloc::{
    collections::BTreeMap as HashMap,
    format,
    string::{String, ToString},
    vec::Vec,
};
#[cfg(feature = "std")]
use std::collections::HashMap;
/// Broad categories of HDF5 datatypes (named after the H5T type classes).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Hdf5TypeClass {
    /// Fixed-point (integer) numbers.
    Integer,
    /// Floating-point numbers.
    Float,
    /// Character strings.
    String,
    /// Bit-field values.
    Bitfield,
    /// Uninterpreted byte blobs (also the fallback in `Hdf5Datatype::from_dtype`).
    Opaque,
    /// Record-like compound types.
    Compound,
    /// Object or region references.
    Reference,
    /// Named enumeration types.
    Enum,
    /// Variable-length sequences.
    VarLen,
    /// Fixed-size array types.
    Array,
}
/// Byte ordering of a stored datatype.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Hdf5ByteOrder {
    /// Least-significant byte first.
    LittleEndian,
    /// Most-significant byte first.
    BigEndian,
    /// Whatever ordering the host platform uses.
    Native,
}
/// Description of an HDF5 datatype: its class, storage size, byte order,
/// and (for numeric classes) signedness, precision, and bit offset.
#[derive(Debug, Clone)]
pub struct Hdf5Datatype {
    /// Broad type category.
    pub class: Hdf5TypeClass,
    /// Storage size in bytes.
    pub size: usize,
    /// Byte ordering of the stored value.
    pub byte_order: Hdf5ByteOrder,
    /// `Some(true)` for signed integers, `Some(false)` for unsigned;
    /// `None` for non-integer classes.
    pub sign: Option<bool>,
    /// Significant bits. `from_dtype` sets the IEEE mantissa width for
    /// floats (23/52) and the full bit width for integers; `None` when
    /// not applicable.
    pub precision: Option<usize>,
    /// Bit offset of the value within its storage; `None` when not
    /// applicable.
    pub offset: Option<usize>,
}
impl Hdf5Datatype {
    /// Native-order IEEE float descriptor with the given byte width and
    /// mantissa (fraction) bit count.
    fn float_type(size: usize, mantissa_bits: usize) -> Self {
        Self {
            class: Hdf5TypeClass::Float,
            size,
            byte_order: Hdf5ByteOrder::Native,
            sign: None,
            precision: Some(mantissa_bits),
            offset: Some(0),
        }
    }

    /// Native-order integer descriptor with the given byte width,
    /// signedness, and precision in bits.
    fn integer_type(size: usize, signed: bool, bits: usize) -> Self {
        Self {
            class: Hdf5TypeClass::Integer,
            size,
            byte_order: Hdf5ByteOrder::Native,
            sign: Some(signed),
            precision: Some(bits),
            offset: Some(0),
        }
    }

    /// Maps a tensor element type to its HDF5 datatype description.
    ///
    /// Types without a dedicated mapping fall back to an opaque blob of
    /// `dtype.size()` bytes with no numeric properties.
    pub fn from_dtype(dtype: DType) -> Self {
        match dtype {
            DType::F32 => Self::float_type(4, 23),
            DType::F64 => Self::float_type(8, 52),
            DType::I8 => Self::integer_type(1, true, 8),
            DType::I16 => Self::integer_type(2, true, 16),
            DType::I32 => Self::integer_type(4, true, 32),
            DType::I64 => Self::integer_type(8, true, 64),
            DType::U8 => Self::integer_type(1, false, 8),
            // Bool is stored as a one-byte unsigned integer using one bit.
            DType::Bool => Self::integer_type(1, false, 1),
            _ => Self {
                class: Hdf5TypeClass::Opaque,
                size: dtype.size(),
                byte_order: Hdf5ByteOrder::Native,
                sign: None,
                precision: None,
                offset: None,
            },
        }
    }

    /// True when this datatype belongs to the floating-point class.
    pub fn is_float(&self) -> bool {
        self.class == Hdf5TypeClass::Float
    }

    /// True when this datatype belongs to the integer class.
    pub fn is_integer(&self) -> bool {
        self.class == Hdf5TypeClass::Integer
    }

    /// True only when the sign flag is present and set; unsigned and
    /// non-integer types report `false`.
    pub fn is_signed(&self) -> bool {
        matches!(self.sign, Some(true))
    }
}
/// Pipeline filters that may be applied to dataset chunks.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Hdf5Filter {
    /// No filtering.
    None,
    /// DEFLATE compression with a 0-9 effort level.
    Gzip { level: u8 },
    /// SZIP compression.
    Szip,
    /// LZF compression.
    Lzf,
    /// Byte-shuffle preprocessing (no compression by itself).
    Shuffle,
    /// Fletcher-32 checksum (integrity only, no compression).
    Fletcher32,
    /// BZIP2 compression with a 1-9 effort level.
    Bzip2 { level: u8 },
    /// LZ4 compression.
    Lz4,
    /// Blosc meta-compressor with a selectable backend and shuffle mode.
    Blosc {
        compressor: BloscCompressor,
        level: u8,
        shuffle: BloscShuffle,
    },
}

/// Backend codecs selectable inside the Blosc meta-compressor.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BloscCompressor {
    BloscLz,
    Lz4,
    Lz4Hc,
    Snappy,
    Zlib,
    Zstd,
}

/// Shuffle preprocessing modes supported by Blosc.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BloscShuffle {
    None,
    Byte,
    Bit,
}

impl Hdf5Filter {
    /// Rough multiplicative compression-ratio guess for this filter.
    ///
    /// Pass-through filters (none, shuffle, checksum) report 1.0; fixed
    /// codecs report a constant; level-based codecs scale linearly with
    /// the requested level.
    pub fn compression_ratio_estimate(&self) -> f32 {
        match self {
            Self::None | Self::Shuffle | Self::Fletcher32 => 1.0,
            Self::Szip => 2.5,
            Self::Lzf => 1.8,
            Self::Lz4 => 1.5,
            Self::Gzip { level } => 2.0 + f32::from(*level) / 9.0 * 3.0,
            Self::Bzip2 { level } => 2.5 + f32::from(*level) / 9.0 * 2.5,
            Self::Blosc { level, .. } => 2.0 + f32::from(*level) / 9.0 * 3.0,
        }
    }

    /// Every supported filter is lossless.
    pub fn is_lossy(&self) -> bool {
        false
    }
}
/// Chunked-storage layout parameters for a dataset, plus optional
/// chunk-cache tuning knobs.
#[derive(Debug, Clone)]
pub struct Hdf5Chunking {
    /// Extent of one chunk along each dataset dimension.
    pub chunk_dims: Vec<usize>,
    /// Chunk cache size in bytes, if overridden.
    pub cache_size: Option<usize>,
    /// Number of chunk-cache hash slots, if overridden.
    pub cache_slots: Option<usize>,
    /// Cache preemption weight; `with_cache_w0` clamps it into [0, 1].
    pub cache_w0: Option<f32>,
}
impl Hdf5Chunking {
    /// Creates a chunking layout with the given per-dimension chunk
    /// extents and no cache overrides.
    pub fn new(chunk_dims: Vec<usize>) -> Self {
        Self {
            chunk_dims,
            cache_size: None,
            cache_slots: None,
            cache_w0: None,
        }
    }

    /// Sets the chunk cache size in bytes (builder style).
    pub fn with_cache_size(mut self, size: usize) -> Self {
        self.cache_size = Some(size);
        self
    }

    /// Sets the number of chunk-cache hash slots (builder style).
    pub fn with_cache_slots(mut self, slots: usize) -> Self {
        self.cache_slots = Some(slots);
        self
    }

    /// Sets the cache preemption weight, clamped into [0, 1] (builder style).
    pub fn with_cache_w0(mut self, w0: f32) -> Self {
        self.cache_w0 = Some(w0.clamp(0.0, 1.0));
        self
    }

    /// Uncompressed size of one chunk: element count times element width.
    pub fn chunk_size_bytes(&self, dtype: &Hdf5Datatype) -> usize {
        self.chunk_dims.iter().product::<usize>() * dtype.size
    }

    /// A layout is valid for `shape` when it has exactly one chunk extent
    /// per dimension, every extent is non-zero, and none exceeds the
    /// corresponding dataset extent.
    pub fn validate(&self, shape: &Shape) -> bool {
        self.chunk_dims.len() == shape.ndim()
            && self
                .chunk_dims
                .iter()
                .zip(shape.dims())
                .all(|(&chunk, &dim)| chunk != 0 && chunk <= dim)
    }

    /// Total number of chunks needed to cover `shape`, rounding each
    /// dimension up; 0 when the layout is invalid for that shape.
    pub fn estimate_num_chunks(&self, shape: &Shape) -> usize {
        if !self.validate(shape) {
            return 0;
        }
        self.chunk_dims
            .iter()
            .zip(shape.dims())
            .map(|(&chunk, &dim)| (dim + chunk - 1) / chunk)
            .product()
    }
}
/// A scalar or one-dimensional attribute payload attached to a dataset,
/// group, or file.
#[derive(Debug, Clone)]
pub enum Hdf5AttributeValue {
    String(String),
    Int(i64),
    Float(f64),
    IntArray(Vec<i64>),
    FloatArray(Vec<f64>),
    StringArray(Vec<String>),
}

impl Hdf5AttributeValue {
    /// Stable lower-case label for the payload variant.
    pub fn type_name(&self) -> &str {
        match self {
            Self::Int(_) => "int",
            Self::Float(_) => "float",
            Self::String(_) => "string",
            Self::IntArray(_) => "int_array",
            Self::FloatArray(_) => "float_array",
            Self::StringArray(_) => "string_array",
        }
    }

    /// Approximate payload size in bytes: numbers count as 8 bytes each,
    /// strings count only their UTF-8 byte length (no length prefixes or
    /// terminators).
    pub fn size_bytes(&self) -> usize {
        match self {
            Self::Int(_) | Self::Float(_) => 8,
            Self::String(text) => text.len(),
            Self::IntArray(values) => 8 * values.len(),
            Self::FloatArray(values) => 8 * values.len(),
            Self::StringArray(items) => items.iter().map(String::len).sum(),
        }
    }
}
/// Full description of one dataset: name, extent, element type, optional
/// chunked layout, filter pipeline, attributes, and fill value.
#[derive(Debug, Clone)]
pub struct Hdf5DatasetMetadata {
    /// Dataset name.
    pub name: String,
    /// Logical extent of the data.
    pub shape: Shape,
    /// Element datatype description.
    pub dtype: Hdf5Datatype,
    /// Chunked-storage layout; `None` means no chunking configured.
    pub chunking: Option<Hdf5Chunking>,
    /// Filters applied to the data, in pipeline order.
    pub filters: Vec<Hdf5Filter>,
    /// Named attributes attached to the dataset.
    pub attributes: HashMap<String, Hdf5AttributeValue>,
    /// Raw bytes of the fill value, if one is set.
    pub fill_value: Option<Vec<u8>>,
    /// Whether modification times are tracked (`new` defaults this to `true`).
    pub track_times: bool,
}
impl Hdf5DatasetMetadata {
pub fn new(name: String, shape: Shape, dtype: DType) -> Self {
Self {
name,
shape,
dtype: Hdf5Datatype::from_dtype(dtype),
chunking: None,
filters: Vec::new(),
attributes: HashMap::new(),
fill_value: None,
track_times: true,
}
}
pub fn with_chunking(mut self, chunking: Hdf5Chunking) -> Self {
self.chunking = Some(chunking);
self
}
pub fn with_filter(mut self, filter: Hdf5Filter) -> Self {
self.filters.push(filter);
self
}
pub fn with_attribute(mut self, key: String, value: Hdf5AttributeValue) -> Self {
self.attributes.insert(key, value);
self
}
pub fn with_fill_value(mut self, value: Vec<u8>) -> Self {
self.fill_value = Some(value);
self
}
pub fn with_track_times(mut self, track: bool) -> Self {
self.track_times = track;
self
}
pub fn dataset_size_bytes(&self) -> usize {
self.shape.numel() * self.dtype.size
}
pub fn compressed_size_estimate(&self) -> usize {
let base_size = self.dataset_size_bytes();
if self.filters.is_empty() {
return base_size;
}
let compression_ratio: f32 = self
.filters
.iter()
.map(|f| f.compression_ratio_estimate())
.product();
(base_size as f32 / compression_ratio) as usize
}
pub fn metadata_size_bytes(&self) -> usize {
let mut size = 0;
size += self.name.len();
size += self.shape.ndim() * 8 + 16;
if let Some(ref chunking) = self.chunking {
size += chunking.chunk_dims.len() * 8 + 16;
}
size += self.filters.len() * 4;
for (key, value) in &self.attributes {
size += key.len() + value.size_bytes();
}
size
}
}
/// A named coordinate axis: one value per index along a dataset
/// dimension, optionally labeled and carrying a unit string.
#[derive(Debug, Clone)]
pub struct Hdf5DimensionScale {
    pub name: String,
    pub values: Vec<f64>,
    pub label: Option<String>,
    pub units: Option<String>,
}

impl Hdf5DimensionScale {
    /// Creates a scale with no label and no units.
    pub fn new(name: String, values: Vec<f64>) -> Self {
        Self {
            name,
            values,
            label: None,
            units: None,
        }
    }

    /// Sets the human-readable axis label (builder style).
    pub fn with_label(mut self, label: String) -> Self {
        self.label = Some(label);
        self
    }

    /// Sets the unit string (builder style).
    pub fn with_units(mut self, units: String) -> Self {
        self.units = Some(units);
        self
    }

    /// True when consecutive values advance by a constant step, within a
    /// tolerance of 1e-6 relative to the first step. Scales with fewer
    /// than two points are trivially uniform.
    pub fn is_uniform(&self) -> bool {
        if self.values.len() < 2 {
            return true;
        }
        let first_step = self.values[1] - self.values[0];
        let tolerance = first_step.abs() * 1e-6;
        // Compare every later step against the first one; the first
        // window (indices 0..2) is the reference itself, so skip it.
        !self
            .values
            .windows(2)
            .skip(1)
            .any(|pair| ((pair[1] - pair[0]) - first_step).abs() > tolerance)
    }

    /// The constant step when the scale is uniform; `None` for
    /// non-uniform scales or fewer than two values.
    pub fn step(&self) -> Option<f64> {
        if self.values.len() < 2 || !self.is_uniform() {
            return None;
        }
        Some(self.values[1] - self.values[0])
    }
}
/// Description of a group: its name, the names of its direct children,
/// and its attributes.
#[derive(Debug, Clone)]
pub struct Hdf5GroupMetadata {
    /// Group name.
    pub name: String,
    /// Names of datasets directly inside this group.
    pub datasets: Vec<String>,
    /// Names of subgroups directly inside this group.
    pub groups: Vec<String>,
    /// Named attributes attached to the group.
    pub attributes: HashMap<String, Hdf5AttributeValue>,
}
impl Hdf5GroupMetadata {
    /// Creates an empty group description with the given name.
    pub fn new(name: String) -> Self {
        Self {
            name,
            datasets: Vec::new(),
            groups: Vec::new(),
            attributes: HashMap::new(),
        }
    }

    /// Records a child dataset name (builder style).
    pub fn add_dataset(mut self, dataset: String) -> Self {
        self.datasets.push(dataset);
        self
    }

    /// Records a child subgroup name (builder style).
    pub fn add_group(mut self, group: String) -> Self {
        self.groups.push(group);
        self
    }

    /// Adds or replaces a group attribute (builder style).
    pub fn with_attribute(mut self, key: String, value: Hdf5AttributeValue) -> Self {
        self.attributes.insert(key, value);
        self
    }

    /// Number of direct children: datasets plus subgroups (attributes
    /// are not counted).
    pub fn num_items(&self) -> usize {
        self.groups.len() + self.datasets.len()
    }
}
/// File-level metadata: format version, the root group, optional user
/// block, and free-form creation properties.
#[derive(Debug, Clone)]
pub struct Hdf5FileMetadata {
    /// Format version as `(major, minor)`; `new` defaults to `(1, 10)`.
    pub version: (u8, u8),
    /// The root (`"/"`) group.
    pub root: Hdf5GroupMetadata,
    /// Size in bytes of the user block, if one is reserved.
    pub user_block_size: Option<usize>,
    /// Free-form key/value creation properties.
    pub creation_properties: HashMap<String, String>,
}
impl Hdf5FileMetadata {
    /// Creates metadata for a version 1.10 file with an empty root group,
    /// no user block, and no creation properties.
    pub fn new() -> Self {
        Self {
            version: (1, 10),
            root: Hdf5GroupMetadata::new("/".to_string()),
            user_block_size: None,
            creation_properties: HashMap::new(),
        }
    }

    /// Overrides the format version (builder style).
    pub fn with_version(mut self, major: u8, minor: u8) -> Self {
        self.version = (major, minor);
        self
    }

    /// Reserves a user block of the given size in bytes (builder style).
    pub fn with_user_block(mut self, size: usize) -> Self {
        self.user_block_size = Some(size);
        self
    }

    /// Adds or replaces a creation property (builder style).
    pub fn with_property(mut self, key: String, value: String) -> Self {
        self.creation_properties.insert(key, value);
        self
    }

    /// Renders the version as `"major.minor"`.
    pub fn version_string(&self) -> String {
        let (major, minor) = self.version;
        format!("{}.{}", major, minor)
    }
}
/// `Default` delegates to `new()`: an empty version 1.10 file description.
impl Default for Hdf5FileMetadata {
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hdf5_datatype_from_dtype() {
        // Float mapping.
        let float_dt = Hdf5Datatype::from_dtype(DType::F32);
        assert_eq!(float_dt.class, Hdf5TypeClass::Float);
        assert_eq!(float_dt.size, 4);
        assert!(float_dt.is_float());
        assert!(!float_dt.is_integer());
        // Signed-integer mapping.
        let int_dt = Hdf5Datatype::from_dtype(DType::I32);
        assert_eq!(int_dt.class, Hdf5TypeClass::Integer);
        assert_eq!(int_dt.size, 4);
        assert!(int_dt.is_integer());
        assert!(int_dt.is_signed());
    }

    #[test]
    fn test_hdf5_filter_compression_ratio() {
        let gzip = Hdf5Filter::Gzip { level: 9 };
        let lz4 = Hdf5Filter::Lz4;
        assert!(gzip.compression_ratio_estimate() > 2.0);
        assert_eq!(lz4.compression_ratio_estimate(), 1.5);
        assert!(!gzip.is_lossy());
    }

    #[test]
    fn test_hdf5_chunking() {
        let data_shape = Shape::new(vec![100, 200, 300]);
        let layout = Hdf5Chunking::new(vec![10, 20, 30])
            .with_cache_size(1024 * 1024)
            .with_cache_slots(521);
        assert!(layout.validate(&data_shape));
        // Each dimension splits into exactly 10 chunks.
        assert_eq!(layout.estimate_num_chunks(&data_shape), 10 * 10 * 10);
        let f32_dt = Hdf5Datatype::from_dtype(DType::F32);
        assert_eq!(layout.chunk_size_bytes(&f32_dt), 10 * 20 * 30 * 4);
    }

    #[test]
    fn test_hdf5_dataset_metadata() {
        let data_shape = Shape::new(vec![100, 200]);
        let meta = Hdf5DatasetMetadata::new("test_dataset".to_string(), data_shape, DType::F32)
            .with_chunking(Hdf5Chunking::new(vec![10, 20]))
            .with_filter(Hdf5Filter::Gzip { level: 6 })
            .with_attribute(
                "description".to_string(),
                Hdf5AttributeValue::String("Test dataset".to_string()),
            );
        assert_eq!(meta.dataset_size_bytes(), 100 * 200 * 4);
        // Gzip's estimated ratio exceeds 1, so the estimate must shrink.
        assert!(meta.compressed_size_estimate() < meta.dataset_size_bytes());
    }

    #[test]
    fn test_dimension_scale() {
        let uniform = Hdf5DimensionScale::new("time".to_string(), vec![0.0, 1.0, 2.0, 3.0, 4.0])
            .with_units("seconds".to_string());
        assert!(uniform.is_uniform());
        assert_eq!(uniform.step(), Some(1.0));
        let ragged = Hdf5DimensionScale::new("x".to_string(), vec![0.0, 1.0, 3.0, 6.0]);
        assert!(!ragged.is_uniform());
        assert_eq!(ragged.step(), None);
    }

    #[test]
    fn test_hdf5_group_metadata() {
        let group = Hdf5GroupMetadata::new("/data".to_string())
            .add_dataset("tensor1".to_string())
            .add_dataset("tensor2".to_string())
            .add_group("subgroup".to_string())
            .with_attribute(
                "created".to_string(),
                Hdf5AttributeValue::String("2025-10-04".to_string()),
            );
        // Attributes do not count toward num_items.
        assert_eq!(group.num_items(), 3);
        assert_eq!(group.datasets.len(), 2);
        assert_eq!(group.groups.len(), 1);
    }

    #[test]
    fn test_hdf5_file_metadata() {
        let file_meta = Hdf5FileMetadata::new()
            .with_version(1, 12)
            .with_user_block(512)
            .with_property("library".to_string(), "torsh".to_string());
        assert_eq!(file_meta.version_string(), "1.12");
        assert_eq!(file_meta.user_block_size, Some(512));
    }
}