use std::collections::HashMap;
use hdf5_reader::group::Group;
use crate::error::{Error, Result};
use crate::types::{NcDimension, NcVariable};
use super::attributes;
use super::types::hdf5_to_nc_type;
/// Returns the final path component of an HDF5 object name.
///
/// For a path like `/group/var` this yields `var`; a name containing no
/// `/` separator is returned unchanged.
fn leaf_name(name: &str) -> &str {
    match name.rfind('/') {
        Some(pos) => &name[pos + 1..],
        None => name,
    }
}
/// Collects all NetCDF variables defined by the datasets of `group`.
///
/// Convenience wrapper: lists the group's datasets and forwards to
/// [`extract_variables_from_datasets`].
pub fn extract_variables(
    group: &Group,
    dimensions: &[NcDimension],
    dim_addr_map: &HashMap<u64, NcDimension>,
    metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcVariable>> {
    extract_variables_from_datasets(
        &group.datasets()?,
        group,
        dimensions,
        dim_addr_map,
        metadata_mode,
    )
}
/// Converts each HDF5 dataset into an [`NcVariable`], dropping datasets
/// that [`extract_variable`] reports as not being variables (`Ok(None)`).
///
/// The first dataset that fails to convert aborts the whole extraction
/// with its error.
pub fn extract_variables_from_datasets(
    datasets: &[hdf5_reader::Dataset],
    group: &Group,
    dimensions: &[NcDimension],
    dim_addr_map: &HashMap<u64, NcDimension>,
    metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcVariable>> {
    datasets
        .iter()
        .filter_map(|ds| {
            // `transpose` maps Ok(None) -> None so skipped datasets drop out,
            // while the first Err short-circuits the collect below.
            extract_variable(ds, group, dimensions, dim_addr_map, metadata_mode).transpose()
        })
        .collect()
}
/// Builds an [`NcVariable`] from a single HDF5 dataset.
///
/// Returns `Ok(None)` when the dataset should not surface as a variable:
/// it is a dimension scale with no backing variable, or (in lossy mode)
/// its HDF5 type has no NetCDF-4 mapping. In strict mode, unreadable
/// CLASS attributes and unsupported types are reported as
/// `Error::InvalidData` instead of being skipped.
pub fn extract_variable(
ds: &hdf5_reader::Dataset,
group: &Group,
dimensions: &[NcDimension],
dim_addr_map: &HashMap<u64, NcDimension>,
metadata_mode: crate::NcMetadataMode,
) -> Result<Option<NcVariable>> {
let strict = metadata_mode == crate::NcMetadataMode::Strict;
// A dataset is a dimension scale when its CLASS attribute reads
// "DIMENSION_SCALE". A missing attribute is never an error; an
// unreadable one is an error only in strict mode.
let is_dim_scale = match ds.attribute("CLASS") {
Ok(attr) => match attr.read_string() {
Ok(value) => value == "DIMENSION_SCALE",
Err(err) if strict => {
return Err(Error::InvalidData(format!(
"dataset '{}' has unreadable CLASS attribute: {err}",
ds.name()
)))
}
Err(_) => false,
},
Err(_) => false,
};
// Dimension scales that only describe a dimension (no variable data of
// their own) are not reported as variables.
if is_dim_scale && is_dimension_only_scale(ds, strict)? {
return Ok(None);
}
// Map the HDF5 datatype to a NetCDF type; unsupported types are an
// error in strict mode and silently skipped otherwise.
let nc_type = match hdf5_to_nc_type(ds.dtype()) {
Ok(t) => t,
Err(err) if strict => {
return Err(Error::InvalidData(format!(
"dataset '{}' uses unsupported NetCDF-4 type: {err}",
ds.name()
)))
}
Err(_) => return Ok(None),
};
// Coordinate variables (dimension scales with data) resolve against their
// own address; regular variables follow their DIMENSION_LIST metadata.
let var_dims = if is_dim_scale {
resolve_coordinate_variable_dimensions(ds, dim_addr_map, metadata_mode)?
} else {
resolve_variable_dimensions(ds, group, dimensions, dim_addr_map, metadata_mode)?
};
// Any unlimited dimension makes this a record variable.
let is_unlimited = var_dims.iter().any(|d| d.is_unlimited);
let shape = ds.shape();
// Overflow-checked total size and per-record size in bytes.
let (data_size, record_size) =
compute_storage_sizes(shape, nc_type.size() as u64, is_unlimited)?;
let var_attrs = attributes::extract_variable_attributes(ds, metadata_mode)?;
Ok(Some(NcVariable {
name: leaf_name(ds.name()).to_string(),
dimensions: var_dims,
dtype: nc_type,
attributes: var_attrs,
data_offset: ds.address(),
_data_size: data_size,
is_record_var: is_unlimited,
record_size,
}))
}
fn is_dimension_only_scale(ds: &hdf5_reader::Dataset, strict: bool) -> Result<bool> {
match ds.attribute("NAME") {
Ok(attr) => match attr.read_string() {
Ok(value) => Ok(super::dimensions::is_dimension_without_variable_name(
&value,
)),
Err(err) if strict => Err(Error::InvalidData(format!(
"dimension scale '{}' has unreadable NAME attribute: {err}",
ds.name()
))),
Err(_) => Ok(false),
},
Err(_) => Ok(false),
}
}
fn resolve_coordinate_variable_dimensions(
ds: &hdf5_reader::Dataset,
dim_addr_map: &HashMap<u64, NcDimension>,
metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcDimension>> {
if let Some(dim) = dim_addr_map.get(&ds.address()) {
Ok(vec![dim.clone()])
} else if metadata_mode == crate::NcMetadataMode::Lossy {
Ok(vec![NcDimension {
name: leaf_name(ds.name()).to_string(),
size: ds.shape().first().copied().unwrap_or(0),
is_unlimited: ds
.max_dims()
.is_some_and(|md| !md.is_empty() && md[0] == u64::MAX),
}])
} else {
Err(Error::InvalidData(format!(
"coordinate variable '{}' is not registered as a dimension scale",
ds.name()
)))
}
}
/// Resolves the dimension list for a regular (non-coordinate) variable.
///
/// Pure forwarding wrapper around `resolve_variable_dimensions_with_mode`.
fn resolve_variable_dimensions(
ds: &hdf5_reader::Dataset,
group: &Group,
dimensions: &[NcDimension],
dim_addr_map: &HashMap<u64, NcDimension>,
metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcDimension>> {
resolve_variable_dimensions_with_mode(ds, group, dimensions, dim_addr_map, metadata_mode)
}
/// Resolves variable dimensions from DIMENSION_LIST metadata, falling back
/// to size-based matching when that metadata is unusable and the mode is
/// lossy; any other mode propagates the metadata error.
fn resolve_variable_dimensions_with_mode(
    ds: &hdf5_reader::Dataset,
    group: &Group,
    dimensions: &[NcDimension],
    dim_addr_map: &HashMap<u64, NcDimension>,
    metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcDimension>> {
    let resolved = resolve_variable_dimensions_from_dimlist(ds, group, dim_addr_map);
    if resolved.is_err() && metadata_mode == crate::NcMetadataMode::Lossy {
        // Lossy mode tolerates missing/broken DIMENSION_LIST metadata and
        // guesses dimensions by matching axis sizes instead.
        return Ok(resolve_variable_dimensions_by_size(ds, dimensions));
    }
    resolved
}
fn resolve_variable_dimensions_from_dimlist(
ds: &hdf5_reader::Dataset,
group: &Group,
dim_addr_map: &HashMap<u64, NcDimension>,
) -> Result<Vec<NcDimension>> {
let dim_addrs = resolve_dimension_scale_addresses(ds, group)?;
let mut var_dims = Vec::with_capacity(dim_addrs.len());
for dim_addr in dim_addrs {
if let Some(dim) = dim_addr_map.get(&dim_addr) {
var_dims.push(dim.clone());
} else {
return Err(Error::InvalidData(format!(
"dataset '{}' references unknown dimension scale address {dim_addr:#x}",
ds.name()
)));
}
}
if let Some(max_dims) = ds.max_dims() {
for (i, md) in max_dims.iter().enumerate() {
if *md == u64::MAX && i < var_dims.len() {
var_dims[i].is_unlimited = true;
}
}
}
Ok(var_dims)
}
/// Reads a dataset's DIMENSION_LIST attribute and returns, per rank, the
/// file address of the referenced dimension-scale dataset.
///
/// The attribute stores one entry per dimension: a 4-byte little-endian
/// sequence length, a global-heap collection address (`offset_size` bytes)
/// and a 4-byte heap object index. Each heap object holds object
/// references; only the first reference of each entry is used.
///
/// # Errors
/// Returns `Error::InvalidData` when the attribute is missing, empty, or
/// truncated, or when any entry cannot be resolved to a scale address.
pub(crate) fn resolve_dimension_scale_addresses(
    ds: &hdf5_reader::Dataset,
    group: &Group,
) -> Result<Vec<u64>> {
    let attr = ds.attribute("DIMENSION_LIST").map_err(|_| {
        Error::InvalidData(format!(
            "dataset '{}' is missing required DIMENSION_LIST metadata",
            ds.name()
        ))
    })?;
    let raw_data = &attr.raw_data;
    let ndim = ds.ndim();
    let offset_size = group.offset_size();
    // Scalar datasets legitimately have no dimensions to resolve.
    if ndim == 0 {
        return Ok(Vec::new());
    }
    if raw_data.is_empty() {
        return Err(Error::InvalidData(format!(
            "dataset '{}' has empty DIMENSION_LIST metadata",
            ds.name()
        )));
    }
    // Per-dimension entry: u32 sequence length + heap address + u32 heap index.
    let entry_size = 4 + usize::from(offset_size) + 4;
    if raw_data.len() < ndim * entry_size {
        return Err(Error::InvalidData(format!(
            "dataset '{}' has truncated DIMENSION_LIST metadata",
            ds.name()
        )));
    }
    let mut dim_addrs = Vec::with_capacity(ndim);
    let mut cursor = hdf5_reader::io::Cursor::new(raw_data);
    for _ in 0..ndim {
        let seq_len = cursor.read_u32_le().map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST entry count: {err}",
                ds.name()
            ))
        })? as usize;
        let heap_addr = cursor.read_offset(offset_size).map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST heap address: {err}",
                ds.name()
            ))
        })?;
        let heap_idx_raw = cursor.read_u32_le().map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST heap index: {err}",
                ds.name()
            ))
        })?;
        // The heap lookup API takes a u16 index; reject anything wider
        // instead of silently truncating (`as u16`), which could make the
        // lookup resolve the wrong heap object.
        let heap_idx = u16::try_from(heap_idx_raw).map_err(|_| {
            Error::InvalidData(format!(
                "dataset '{}' has out-of-range DIMENSION_LIST heap index {heap_idx_raw}",
                ds.name()
            ))
        })?;
        if seq_len == 0 || hdf5_reader::io::Cursor::is_undefined_offset(heap_addr, offset_size) {
            return Err(Error::InvalidData(format!(
                "dataset '{}' has an unresolved DIMENSION_LIST reference",
                ds.name()
            )));
        }
        let collection = hdf5_reader::global_heap::GlobalHeapCollection::parse_at_storage(
            group.storage(),
            heap_addr,
            offset_size,
            group.length_size(),
        )
        .map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has unreadable DIMENSION_LIST heap object: {err}",
                ds.name()
            ))
        })?;
        let heap_obj = collection.get_object(heap_idx).ok_or_else(|| {
            Error::InvalidData(format!(
                "dataset '{}' references missing DIMENSION_LIST heap object {}",
                ds.name(),
                heap_idx
            ))
        })?;
        let refs = hdf5_reader::reference::read_object_references(&heap_obj.data, offset_size)
            .map_err(|err| {
                Error::InvalidData(format!(
                    "dataset '{}' has invalid DIMENSION_LIST references: {err}",
                    ds.name()
                ))
            })?;
        if refs.is_empty() {
            return Err(Error::InvalidData(format!(
                "dataset '{}' has empty DIMENSION_LIST references",
                ds.name()
            )));
        }
        dim_addrs.push(refs[0]);
    }
    Ok(dim_addrs)
}
/// Computes the total byte size of a variable and, for record variables,
/// the byte size of a single record (the product of all dimensions past
/// the first, times the element size).
///
/// Both multiplications are overflow-checked; failures surface as errors
/// from the shared checked-arithmetic helpers.
fn compute_storage_sizes(shape: &[u64], elem_size: u64, is_unlimited: bool) -> Result<(u64, u64)> {
    let total_elements =
        crate::types::checked_shape_elements(shape, "NetCDF-4 variable element count")?;
    let data_size = crate::types::checked_mul_u64(
        total_elements,
        elem_size,
        "NetCDF-4 variable size in bytes",
    )?;
    // A record spans everything except the unlimited (first) dimension;
    // non-record and rank<=1 variables count a single element per record.
    let record_elements = match shape.split_first() {
        Some((_, rest)) if is_unlimited && !rest.is_empty() => {
            crate::types::checked_shape_elements(rest, "NetCDF-4 record element count")?
        }
        _ => 1,
    };
    let record_size =
        crate::types::checked_mul_u64(record_elements, elem_size, "NetCDF-4 record size in bytes")?;
    Ok((data_size, record_size))
}
/// Best-effort dimension matching for lossy mode: pairs each axis of the
/// dataset's shape with the first unused known dimension of the same size,
/// or fabricates an anonymous `dim_<size>` dimension when none matches.
///
/// Unlimited flags are then copied from the dataset's max-dims, where a
/// max extent of `u64::MAX` marks an unlimited axis.
fn resolve_variable_dimensions_by_size(
    ds: &hdf5_reader::Dataset,
    dimensions: &[NcDimension],
) -> Vec<NcDimension> {
    let mut used = vec![false; dimensions.len()];
    let mut var_dims: Vec<NcDimension> = ds
        .shape()
        .iter()
        .map(|&dim_size| {
            // Greedy match: each declared dimension may be consumed once.
            let hit = dimensions
                .iter()
                .enumerate()
                .find(|(i, dim)| !used[*i] && dim.size == dim_size);
            match hit {
                Some((i, dim)) => {
                    used[i] = true;
                    dim.clone()
                }
                None => NcDimension {
                    name: format!("dim_{}", dim_size),
                    size: dim_size,
                    is_unlimited: false,
                },
            }
        })
        .collect();
    if let Some(max_dims) = ds.max_dims() {
        // zip stops at the shorter sequence, matching the original's
        // explicit bounds check.
        for (dim, md) in var_dims.iter_mut().zip(max_dims.iter()) {
            if *md == u64::MAX {
                dim.is_unlimited = true;
            }
        }
    }
    var_dims
}
#[cfg(test)]
mod tests {
    use super::compute_storage_sizes;

    /// Overflow in the element-count multiplication must surface as
    /// `InvalidData` rather than wrapping.
    #[test]
    fn test_compute_storage_sizes_detects_overflow() {
        let result = compute_storage_sizes(&[u64::MAX, 2], 8, false);
        assert!(matches!(result, Err(crate::Error::InvalidData(_))));
    }

    /// A record variable's record size covers all non-record dimensions:
    /// 3 * 4 elements * 4 bytes = 48; total = 10 * 48 = 480.
    #[test]
    fn test_compute_storage_sizes_record_dims() {
        let sizes = compute_storage_sizes(&[10, 3, 4], 4, true).unwrap();
        assert_eq!(sizes, (480, 48));
    }
}