// netcdf_reader/nc4/variables.rs

//! Map HDF5 datasets to NetCDF-4 variables.
//!
//! Each HDF5 dataset that is NOT a dimension scale becomes an NcVariable.
//! The variable's dimensions are determined from the `DIMENSION_LIST` attribute,
//! which contains object references to the corresponding dimension-scale datasets.
use std::collections::HashMap;

use hdf5_reader::group::Group;

use crate::error::{Error, Result};
use crate::types::{NcDimension, NcVariable};

use super::attributes;
use super::types::hdf5_to_nc_type;
/// Return the final path component of an HDF5 object name
/// (everything after the last `/`, or the whole name if there is none).
fn leaf_name(name: &str) -> &str {
    match name.rfind('/') {
        Some(idx) => &name[idx + 1..],
        None => name,
    }
}
21/// Extract variables from an HDF5 group.
22///
23/// Datasets with `CLASS=DIMENSION_SCALE` are dimensions, not variables.
24/// All other datasets become NcVariables.
25///
26/// `dim_addr_map` maps dimension-scale dataset addresses to their `NcDimension`,
27/// used to resolve `DIMENSION_LIST` object references.
28pub fn extract_variables(
29    group: &Group,
30    dimensions: &[NcDimension],
31    dim_addr_map: &HashMap<u64, NcDimension>,
32    metadata_mode: crate::NcMetadataMode,
33) -> Result<Vec<NcVariable>> {
34    let datasets = group.datasets()?;
35    extract_variables_from_datasets(&datasets, group, dimensions, dim_addr_map, metadata_mode)
36}
37
38pub fn extract_variables_from_datasets(
39    datasets: &[hdf5_reader::Dataset],
40    group: &Group,
41    dimensions: &[NcDimension],
42    dim_addr_map: &HashMap<u64, NcDimension>,
43    metadata_mode: crate::NcMetadataMode,
44) -> Result<Vec<NcVariable>> {
45    let mut variables = Vec::new();
46
47    for ds in datasets {
48        if let Some(variable) =
49            extract_variable(ds, group, dimensions, dim_addr_map, metadata_mode)?
50        {
51            variables.push(variable);
52        }
53    }
54
55    Ok(variables)
56}
57
58pub fn extract_variable(
59    ds: &hdf5_reader::Dataset,
60    group: &Group,
61    dimensions: &[NcDimension],
62    dim_addr_map: &HashMap<u64, NcDimension>,
63    metadata_mode: crate::NcMetadataMode,
64) -> Result<Option<NcVariable>> {
65    let strict = metadata_mode == crate::NcMetadataMode::Strict;
66
67    let is_dim_scale = match ds.attribute("CLASS") {
68        Ok(attr) => match attr.read_string() {
69            Ok(value) => value == "DIMENSION_SCALE",
70            Err(err) if strict => {
71                return Err(Error::InvalidData(format!(
72                    "dataset '{}' has unreadable CLASS attribute: {err}",
73                    ds.name()
74                )))
75            }
76            Err(_) => false,
77        },
78        Err(_) => false,
79    };
80
81    if is_dim_scale {
82        return Ok(None);
83    }
84
85    let nc_type = match hdf5_to_nc_type(ds.dtype()) {
86        Ok(t) => t,
87        Err(err) if strict => {
88            return Err(Error::InvalidData(format!(
89                "dataset '{}' uses unsupported NetCDF-4 type: {err}",
90                ds.name()
91            )))
92        }
93        Err(_) => return Ok(None),
94    };
95
96    let var_dims = resolve_variable_dimensions(ds, group, dimensions, dim_addr_map, metadata_mode)?;
97    let is_unlimited = var_dims.iter().any(|d| d.is_unlimited);
98    let shape = ds.shape();
99    let (data_size, record_size) =
100        compute_storage_sizes(shape, nc_type.size() as u64, is_unlimited)?;
101    let var_attrs = attributes::extract_variable_attributes(ds, metadata_mode)?;
102
103    Ok(Some(NcVariable {
104        name: leaf_name(ds.name()).to_string(),
105        dimensions: var_dims,
106        dtype: nc_type,
107        attributes: var_attrs,
108        data_offset: ds.address(),
109        _data_size: data_size,
110        is_record_var: is_unlimited,
111        record_size,
112    }))
113}
114
/// Resolve variable dimensions via the `DIMENSION_LIST` attribute.
///
/// `DIMENSION_LIST` is a VLen-of-object-reference attribute. Each entry is a
/// variable-length sequence of object references pointing to dimension-scale
/// datasets. We parse the raw attribute data to extract these references.
///
/// This is a thin forwarding wrapper around
/// `resolve_variable_dimensions_with_mode`; the mode-aware function decides
/// whether a `DIMENSION_LIST` failure is fatal or falls back to size matching.
fn resolve_variable_dimensions(
    ds: &hdf5_reader::Dataset,
    group: &Group,
    dimensions: &[NcDimension],
    dim_addr_map: &HashMap<u64, NcDimension>,
    metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcDimension>> {
    resolve_variable_dimensions_with_mode(ds, group, dimensions, dim_addr_map, metadata_mode)
}
131fn resolve_variable_dimensions_with_mode(
132    ds: &hdf5_reader::Dataset,
133    group: &Group,
134    dimensions: &[NcDimension],
135    dim_addr_map: &HashMap<u64, NcDimension>,
136    metadata_mode: crate::NcMetadataMode,
137) -> Result<Vec<NcDimension>> {
138    match resolve_variable_dimensions_from_dimlist(ds, group, dim_addr_map) {
139        Ok(dims) => Ok(dims),
140        Err(_err) if metadata_mode == crate::NcMetadataMode::Lossy => {
141            Ok(resolve_variable_dimensions_by_size(ds, dimensions))
142        }
143        Err(err) => Err(err),
144    }
145}
146
147/// Resolve variable dimensions via the `DIMENSION_LIST` attribute.
148fn resolve_variable_dimensions_from_dimlist(
149    ds: &hdf5_reader::Dataset,
150    group: &Group,
151    dim_addr_map: &HashMap<u64, NcDimension>,
152) -> Result<Vec<NcDimension>> {
153    let dim_addrs = resolve_dimension_scale_addresses(ds, group)?;
154    let mut var_dims = Vec::with_capacity(dim_addrs.len());
155    for dim_addr in dim_addrs {
156        if let Some(dim) = dim_addr_map.get(&dim_addr) {
157            var_dims.push(dim.clone());
158        } else {
159            return Err(Error::InvalidData(format!(
160                "dataset '{}' references unknown dimension scale address {dim_addr:#x}",
161                ds.name()
162            )));
163        }
164    }
165
166    // Apply unlimited status from the dataset's max_dims.
167    if let Some(max_dims) = ds.max_dims() {
168        for (i, md) in max_dims.iter().enumerate() {
169            if *md == u64::MAX && i < var_dims.len() {
170                var_dims[i].is_unlimited = true;
171            }
172        }
173    }
174
175    Ok(var_dims)
176}
177
/// Read the `DIMENSION_LIST` attribute of `ds` and return, for each of the
/// dataset's dimensions, the file address of the referenced dimension-scale
/// dataset.
///
/// The raw attribute bytes are parsed as one entry per dimension, each laid
/// out as: a 4-byte little-endian sequence length, a global-heap collection
/// address of `offset_size` bytes, and a 4-byte little-endian object index
/// into that collection. The heap object holds the object references; only
/// the first reference of each entry is used (additional attached scales,
/// if any, are ignored).
///
/// # Errors
///
/// Returns `Error::InvalidData` when the attribute is missing, empty, or
/// truncated, or when any entry cannot be resolved to an address.
pub(crate) fn resolve_dimension_scale_addresses(
    ds: &hdf5_reader::Dataset,
    group: &Group,
) -> Result<Vec<u64>> {
    let attr = ds.attribute("DIMENSION_LIST").map_err(|_| {
        Error::InvalidData(format!(
            "dataset '{}' is missing required DIMENSION_LIST metadata",
            ds.name()
        ))
    })?;
    let raw_data = &attr.raw_data;
    let ndim = ds.ndim();
    let offset_size = group.offset_size();

    // A scalar dataset has no dimensions, hence nothing to resolve.
    if ndim == 0 {
        return Ok(Vec::new());
    }
    if raw_data.is_empty() {
        return Err(Error::InvalidData(format!(
            "dataset '{}' has empty DIMENSION_LIST metadata",
            ds.name()
        )));
    }

    // Per-dimension entry: u32 sequence length + heap address + u32 heap index.
    let entry_size = 4 + usize::from(offset_size) + 4;
    if raw_data.len() < ndim * entry_size {
        return Err(Error::InvalidData(format!(
            "dataset '{}' has truncated DIMENSION_LIST metadata",
            ds.name()
        )));
    }

    let mut dim_addrs = Vec::with_capacity(ndim);
    let mut cursor = hdf5_reader::io::Cursor::new(raw_data);

    for _ in 0..ndim {
        let seq_len = cursor.read_u32_le().map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST entry count: {err}",
                ds.name()
            ))
        })? as usize;
        let heap_addr = cursor.read_offset(offset_size).map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST heap address: {err}",
                ds.name()
            ))
        })?;
        // NOTE(review): the on-disk index field is 4 bytes but is narrowed to
        // u16 here, presumably to match `get_object`'s signature; an index
        // above u16::MAX would silently truncate — confirm this cannot occur.
        let heap_idx = cursor.read_u32_le().map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST heap index: {err}",
                ds.name()
            ))
        })? as u16;

        // An empty sequence or an undefined heap address means this dimension
        // has no resolvable dimension-scale reference.
        if seq_len == 0 || hdf5_reader::io::Cursor::is_undefined_offset(heap_addr, offset_size) {
            return Err(Error::InvalidData(format!(
                "dataset '{}' has an unresolved DIMENSION_LIST reference",
                ds.name()
            )));
        }

        // Parse the global-heap collection holding the reference payload.
        let collection = hdf5_reader::global_heap::GlobalHeapCollection::parse_at_storage(
            group.storage(),
            heap_addr,
            offset_size,
            group.length_size(),
        )
        .map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has unreadable DIMENSION_LIST heap object: {err}",
                ds.name()
            ))
        })?;

        let heap_obj = collection.get_object(heap_idx).ok_or_else(|| {
            Error::InvalidData(format!(
                "dataset '{}' references missing DIMENSION_LIST heap object {}",
                ds.name(),
                heap_idx
            ))
        })?;

        // Decode the object references stored in the heap object's payload.
        let refs = hdf5_reader::reference::read_object_references(&heap_obj.data, offset_size)
            .map_err(|err| {
                Error::InvalidData(format!(
                    "dataset '{}' has invalid DIMENSION_LIST references: {err}",
                    ds.name()
                ))
            })?;

        if refs.is_empty() {
            return Err(Error::InvalidData(format!(
                "dataset '{}' has empty DIMENSION_LIST references",
                ds.name()
            )));
        }

        // Keep only the first reference for this dimension.
        dim_addrs.push(refs[0]);
    }

    Ok(dim_addrs)
}
282fn compute_storage_sizes(shape: &[u64], elem_size: u64, is_unlimited: bool) -> Result<(u64, u64)> {
283    let total_elements =
284        crate::types::checked_shape_elements(shape, "NetCDF-4 variable element count")?;
285    let data_size = crate::types::checked_mul_u64(
286        total_elements,
287        elem_size,
288        "NetCDF-4 variable size in bytes",
289    )?;
290
291    let record_elements = if is_unlimited && shape.len() > 1 {
292        crate::types::checked_shape_elements(&shape[1..], "NetCDF-4 record element count")?
293    } else {
294        1
295    };
296    let record_size =
297        crate::types::checked_mul_u64(record_elements, elem_size, "NetCDF-4 record size in bytes")?;
298
299    Ok((data_size, record_size))
300}
301
302/// Resolve dimensions for a variable by matching its shape against the
303/// group's dimensions. Falls back to anonymous dimensions from the shape.
304fn resolve_variable_dimensions_by_size(
305    ds: &hdf5_reader::Dataset,
306    dimensions: &[NcDimension],
307) -> Vec<NcDimension> {
308    let shape = ds.shape();
309
310    // Try to match dimensions by size (simple heuristic when DIMENSION_LIST
311    // isn't available or parseable). This matches dims in order.
312    let mut var_dims = Vec::with_capacity(shape.len());
313    let mut used = vec![false; dimensions.len()];
314
315    for &dim_size in shape {
316        let mut matched = false;
317        for (i, dim) in dimensions.iter().enumerate() {
318            if !used[i] && dim.size == dim_size {
319                var_dims.push(dim.clone());
320                used[i] = true;
321                matched = true;
322                break;
323            }
324        }
325        if !matched {
326            // Create an anonymous dimension
327            var_dims.push(NcDimension {
328                name: format!("dim_{}", dim_size),
329                size: dim_size,
330                is_unlimited: false,
331            });
332        }
333    }
334
335    // Check if any matched dimension is unlimited and update dataspace info
336    if let Some(max_dims) = ds.max_dims() {
337        for (i, md) in max_dims.iter().enumerate() {
338            if *md == u64::MAX && i < var_dims.len() {
339                var_dims[i].is_unlimited = true;
340            }
341        }
342    }
343
344    var_dims
345}
346
#[cfg(test)]
mod tests {
    use super::compute_storage_sizes;

    /// u64::MAX elements times 8 bytes must be rejected, not silently wrap.
    #[test]
    fn test_compute_storage_sizes_detects_overflow() {
        let err = compute_storage_sizes(&[u64::MAX, 2], 8, false).unwrap_err();
        assert!(matches!(err, crate::Error::InvalidData(_)));
    }

    /// For a record variable, record_size is one slab along dimension 0.
    #[test]
    fn test_compute_storage_sizes_record_dims() {
        // 10*3*4 elements * 4 bytes total; one record = 3*4 elements * 4 bytes.
        let sizes = compute_storage_sizes(&[10, 3, 4], 4, true).unwrap();
        assert_eq!(sizes, (480, 48));
    }
}
363}