Skip to main content

netcdf_reader/nc4/
variables.rs

1//! Map HDF5 datasets to NetCDF-4 variables.
2//!
3//! Each user-visible HDF5 dataset becomes an NcVariable. Dimension-only scale
4//! datasets are internal metadata, but coordinate variables are also dimension
5//! scales and must remain visible. Non-scale variable dimensions are determined
6//! from the `DIMENSION_LIST` attribute, which contains object references to the
7//! corresponding dimension-scale datasets.
8
9use std::collections::HashMap;
10
11use hdf5_reader::group::Group;
12
13use crate::error::{Error, Result};
14use crate::types::{NcDimension, NcVariable};
15
16use super::attributes;
17use super::types::hdf5_to_nc_type;
18
/// Return the final component of a slash-separated HDF5 object path.
fn leaf_name(name: &str) -> &str {
    match name.rfind('/') {
        Some(idx) => &name[idx + 1..],
        None => name,
    }
}
22
23/// Extract variables from an HDF5 group.
24///
25/// Dimension-only scale datasets are internal metadata. Coordinate variables are
26/// also dimension scales, so they are preserved as NcVariables.
27///
28/// `dim_addr_map` maps dimension-scale dataset addresses to their `NcDimension`,
29/// used to resolve `DIMENSION_LIST` object references.
30pub fn extract_variables(
31    group: &Group,
32    dimensions: &[NcDimension],
33    dim_addr_map: &HashMap<u64, NcDimension>,
34    metadata_mode: crate::NcMetadataMode,
35) -> Result<Vec<NcVariable>> {
36    let datasets = group.datasets()?;
37    extract_variables_from_datasets(&datasets, group, dimensions, dim_addr_map, metadata_mode)
38}
39
40pub fn extract_variables_from_datasets(
41    datasets: &[hdf5_reader::Dataset],
42    group: &Group,
43    dimensions: &[NcDimension],
44    dim_addr_map: &HashMap<u64, NcDimension>,
45    metadata_mode: crate::NcMetadataMode,
46) -> Result<Vec<NcVariable>> {
47    let mut variables = Vec::new();
48
49    for ds in datasets {
50        if let Some(variable) =
51            extract_variable(ds, group, dimensions, dim_addr_map, metadata_mode)?
52        {
53            variables.push(variable);
54        }
55    }
56
57    Ok(variables)
58}
59
/// Convert a single HDF5 dataset into an `NcVariable`.
///
/// Returns `Ok(None)` when the dataset is internal metadata (a dimension-only
/// scale) or, in lossy mode, when its HDF5 type has no NetCDF-4 equivalent.
/// In strict mode, unreadable or unsupported metadata becomes an error instead.
pub fn extract_variable(
    ds: &hdf5_reader::Dataset,
    group: &Group,
    dimensions: &[NcDimension],
    dim_addr_map: &HashMap<u64, NcDimension>,
    metadata_mode: crate::NcMetadataMode,
) -> Result<Option<NcVariable>> {
    let strict = metadata_mode == crate::NcMetadataMode::Strict;

    // A dataset is a dimension scale iff its CLASS attribute reads exactly
    // "DIMENSION_SCALE". A *missing* CLASS attribute is never an error; an
    // *unreadable* one is an error only in strict mode.
    let is_dim_scale = match ds.attribute("CLASS") {
        Ok(attr) => match attr.read_string() {
            Ok(value) => value == "DIMENSION_SCALE",
            Err(err) if strict => {
                return Err(Error::InvalidData(format!(
                    "dataset '{}' has unreadable CLASS attribute: {err}",
                    ds.name()
                )))
            }
            Err(_) => false,
        },
        Err(_) => false,
    };

    // Dimension-only scales are bookkeeping datasets, not user variables.
    // Coordinate variables are also dimension scales, so this check is what
    // keeps them visible while hiding the pure-dimension ones.
    if is_dim_scale && is_dimension_only_scale(ds, strict)? {
        return Ok(None);
    }

    // Map the HDF5 datatype to a NetCDF-4 type. Lossy mode silently drops
    // variables with unmappable types; strict mode reports them.
    let nc_type = match hdf5_to_nc_type(ds.dtype()) {
        Ok(t) => t,
        Err(err) if strict => {
            return Err(Error::InvalidData(format!(
                "dataset '{}' uses unsupported NetCDF-4 type: {err}",
                ds.name()
            )))
        }
        Err(_) => return Ok(None),
    };

    // Coordinate variables ARE dimension scales, so their dimension is found
    // by their own address; regular variables resolve via DIMENSION_LIST.
    let var_dims = if is_dim_scale {
        resolve_coordinate_variable_dimensions(ds, dim_addr_map, metadata_mode)?
    } else {
        resolve_variable_dimensions(ds, group, dimensions, dim_addr_map, metadata_mode)?
    };
    // A variable is a record variable if any of its dimensions is unlimited.
    let is_unlimited = var_dims.iter().any(|d| d.is_unlimited);
    let shape = ds.shape();
    let (data_size, record_size) =
        compute_storage_sizes(shape, nc_type.size() as u64, is_unlimited)?;
    let var_attrs = attributes::extract_variable_attributes(ds, metadata_mode)?;

    Ok(Some(NcVariable {
        name: leaf_name(ds.name()).to_string(),
        dimensions: var_dims,
        dtype: nc_type,
        attributes: var_attrs,
        data_offset: ds.address(),
        _data_size: data_size,
        is_record_var: is_unlimited,
        record_size,
    }))
}
120
121fn is_dimension_only_scale(ds: &hdf5_reader::Dataset, strict: bool) -> Result<bool> {
122    match ds.attribute("NAME") {
123        Ok(attr) => match attr.read_string() {
124            Ok(value) => Ok(super::dimensions::is_dimension_without_variable_name(
125                &value,
126            )),
127            Err(err) if strict => Err(Error::InvalidData(format!(
128                "dimension scale '{}' has unreadable NAME attribute: {err}",
129                ds.name()
130            ))),
131            Err(_) => Ok(false),
132        },
133        Err(_) => Ok(false),
134    }
135}
136
137fn resolve_coordinate_variable_dimensions(
138    ds: &hdf5_reader::Dataset,
139    dim_addr_map: &HashMap<u64, NcDimension>,
140    metadata_mode: crate::NcMetadataMode,
141) -> Result<Vec<NcDimension>> {
142    if let Some(dim) = dim_addr_map.get(&ds.address()) {
143        Ok(vec![dim.clone()])
144    } else if metadata_mode == crate::NcMetadataMode::Lossy {
145        Ok(vec![NcDimension {
146            name: leaf_name(ds.name()).to_string(),
147            size: ds.shape().first().copied().unwrap_or(0),
148            is_unlimited: ds
149                .max_dims()
150                .is_some_and(|md| !md.is_empty() && md[0] == u64::MAX),
151        }])
152    } else {
153        Err(Error::InvalidData(format!(
154            "coordinate variable '{}' is not registered as a dimension scale",
155            ds.name()
156        )))
157    }
158}
159
160/// Resolve variable dimensions via the `DIMENSION_LIST` attribute.
161///
162/// `DIMENSION_LIST` is a VLen-of-object-reference attribute. Each entry is a
163/// variable-length sequence of object references pointing to dimension-scale
164/// datasets. We parse the raw attribute data to extract these references.
165///
166fn resolve_variable_dimensions(
167    ds: &hdf5_reader::Dataset,
168    group: &Group,
169    dimensions: &[NcDimension],
170    dim_addr_map: &HashMap<u64, NcDimension>,
171    metadata_mode: crate::NcMetadataMode,
172) -> Result<Vec<NcDimension>> {
173    resolve_variable_dimensions_with_mode(ds, group, dimensions, dim_addr_map, metadata_mode)
174}
175
176fn resolve_variable_dimensions_with_mode(
177    ds: &hdf5_reader::Dataset,
178    group: &Group,
179    dimensions: &[NcDimension],
180    dim_addr_map: &HashMap<u64, NcDimension>,
181    metadata_mode: crate::NcMetadataMode,
182) -> Result<Vec<NcDimension>> {
183    match resolve_variable_dimensions_from_dimlist(ds, group, dim_addr_map) {
184        Ok(dims) => Ok(dims),
185        Err(_err) if metadata_mode == crate::NcMetadataMode::Lossy => {
186            Ok(resolve_variable_dimensions_by_size(ds, dimensions))
187        }
188        Err(err) => Err(err),
189    }
190}
191
192/// Resolve variable dimensions via the `DIMENSION_LIST` attribute.
193fn resolve_variable_dimensions_from_dimlist(
194    ds: &hdf5_reader::Dataset,
195    group: &Group,
196    dim_addr_map: &HashMap<u64, NcDimension>,
197) -> Result<Vec<NcDimension>> {
198    let dim_addrs = resolve_dimension_scale_addresses(ds, group)?;
199    let mut var_dims = Vec::with_capacity(dim_addrs.len());
200    for dim_addr in dim_addrs {
201        if let Some(dim) = dim_addr_map.get(&dim_addr) {
202            var_dims.push(dim.clone());
203        } else {
204            return Err(Error::InvalidData(format!(
205                "dataset '{}' references unknown dimension scale address {dim_addr:#x}",
206                ds.name()
207            )));
208        }
209    }
210
211    // Apply unlimited status from the dataset's max_dims.
212    if let Some(max_dims) = ds.max_dims() {
213        for (i, md) in max_dims.iter().enumerate() {
214            if *md == u64::MAX && i < var_dims.len() {
215                var_dims[i].is_unlimited = true;
216            }
217        }
218    }
219
220    Ok(var_dims)
221}
222
/// Extract, for each dataset dimension, the file address of the referenced
/// dimension-scale dataset from the raw `DIMENSION_LIST` attribute bytes.
///
/// Each of the `ndim` entries is parsed as: a 4-byte little-endian sequence
/// length, an `offset_size`-byte global-heap collection address, and a
/// 4-byte heap object index. The object references inside the heap object
/// are then decoded, and the first reference of each entry is kept.
///
/// Returns an empty vector for scalar (0-dimensional) datasets; any missing,
/// truncated, or unresolvable piece of the chain is an `InvalidData` error.
pub(crate) fn resolve_dimension_scale_addresses(
    ds: &hdf5_reader::Dataset,
    group: &Group,
) -> Result<Vec<u64>> {
    let attr = ds.attribute("DIMENSION_LIST").map_err(|_| {
        Error::InvalidData(format!(
            "dataset '{}' is missing required DIMENSION_LIST metadata",
            ds.name()
        ))
    })?;
    let raw_data = &attr.raw_data;
    let ndim = ds.ndim();
    let offset_size = group.offset_size();

    // Scalar datasets have no dimensions to resolve.
    if ndim == 0 {
        return Ok(Vec::new());
    }
    if raw_data.is_empty() {
        return Err(Error::InvalidData(format!(
            "dataset '{}' has empty DIMENSION_LIST metadata",
            ds.name()
        )));
    }

    // One entry per dimension: 4-byte length + heap address + 4-byte index.
    let entry_size = 4 + usize::from(offset_size) + 4;
    if raw_data.len() < ndim * entry_size {
        return Err(Error::InvalidData(format!(
            "dataset '{}' has truncated DIMENSION_LIST metadata",
            ds.name()
        )));
    }

    let mut dim_addrs = Vec::with_capacity(ndim);
    let mut cursor = hdf5_reader::io::Cursor::new(raw_data);

    for _ in 0..ndim {
        let seq_len = cursor.read_u32_le().map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST entry count: {err}",
                ds.name()
            ))
        })? as usize;
        let heap_addr = cursor.read_offset(offset_size).map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST heap address: {err}",
                ds.name()
            ))
        })?;
        // NOTE(review): the index is stored as 4 bytes but truncated to u16
        // here; an index above 65535 would silently alias a lower one —
        // confirm against get_object's index width.
        let heap_idx = cursor.read_u32_le().map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has invalid DIMENSION_LIST heap index: {err}",
                ds.name()
            ))
        })? as u16;

        // An empty sequence or an undefined heap address means this
        // dimension's reference was never written.
        if seq_len == 0 || hdf5_reader::io::Cursor::is_undefined_offset(heap_addr, offset_size) {
            return Err(Error::InvalidData(format!(
                "dataset '{}' has an unresolved DIMENSION_LIST reference",
                ds.name()
            )));
        }

        // The references live in a global heap collection elsewhere in the
        // file; parse the collection at the address the entry points to.
        let collection = hdf5_reader::global_heap::GlobalHeapCollection::parse_at_storage(
            group.storage(),
            heap_addr,
            offset_size,
            group.length_size(),
        )
        .map_err(|err| {
            Error::InvalidData(format!(
                "dataset '{}' has unreadable DIMENSION_LIST heap object: {err}",
                ds.name()
            ))
        })?;

        let heap_obj = collection.get_object(heap_idx).ok_or_else(|| {
            Error::InvalidData(format!(
                "dataset '{}' references missing DIMENSION_LIST heap object {}",
                ds.name(),
                heap_idx
            ))
        })?;

        let refs = hdf5_reader::reference::read_object_references(&heap_obj.data, offset_size)
            .map_err(|err| {
                Error::InvalidData(format!(
                    "dataset '{}' has invalid DIMENSION_LIST references: {err}",
                    ds.name()
                ))
            })?;

        if refs.is_empty() {
            return Err(Error::InvalidData(format!(
                "dataset '{}' has empty DIMENSION_LIST references",
                ds.name()
            )));
        }

        // Only the first reference per dimension is used; multiple scales
        // attached to one axis are not represented here.
        dim_addrs.push(refs[0]);
    }

    Ok(dim_addrs)
}
326
327fn compute_storage_sizes(shape: &[u64], elem_size: u64, is_unlimited: bool) -> Result<(u64, u64)> {
328    let total_elements =
329        crate::types::checked_shape_elements(shape, "NetCDF-4 variable element count")?;
330    let data_size = crate::types::checked_mul_u64(
331        total_elements,
332        elem_size,
333        "NetCDF-4 variable size in bytes",
334    )?;
335
336    let record_elements = if is_unlimited && shape.len() > 1 {
337        crate::types::checked_shape_elements(&shape[1..], "NetCDF-4 record element count")?
338    } else {
339        1
340    };
341    let record_size =
342        crate::types::checked_mul_u64(record_elements, elem_size, "NetCDF-4 record size in bytes")?;
343
344    Ok((data_size, record_size))
345}
346
347/// Resolve dimensions for a variable by matching its shape against the
348/// group's dimensions. Falls back to anonymous dimensions from the shape.
349fn resolve_variable_dimensions_by_size(
350    ds: &hdf5_reader::Dataset,
351    dimensions: &[NcDimension],
352) -> Vec<NcDimension> {
353    let shape = ds.shape();
354
355    // Try to match dimensions by size (simple heuristic when DIMENSION_LIST
356    // isn't available or parseable). This matches dims in order.
357    let mut var_dims = Vec::with_capacity(shape.len());
358    let mut used = vec![false; dimensions.len()];
359
360    for &dim_size in shape {
361        let mut matched = false;
362        for (i, dim) in dimensions.iter().enumerate() {
363            if !used[i] && dim.size == dim_size {
364                var_dims.push(dim.clone());
365                used[i] = true;
366                matched = true;
367                break;
368            }
369        }
370        if !matched {
371            // Create an anonymous dimension
372            var_dims.push(NcDimension {
373                name: format!("dim_{}", dim_size),
374                size: dim_size,
375                is_unlimited: false,
376            });
377        }
378    }
379
380    // Check if any matched dimension is unlimited and update dataspace info
381    if let Some(max_dims) = ds.max_dims() {
382        for (i, md) in max_dims.iter().enumerate() {
383            if *md == u64::MAX && i < var_dims.len() {
384                var_dims[i].is_unlimited = true;
385            }
386        }
387    }
388
389    var_dims
390}
391
#[cfg(test)]
mod tests {
    use super::compute_storage_sizes;

    // u64::MAX elements * 8 bytes must overflow; the checked multiply has to
    // surface this as InvalidData rather than wrapping silently.
    #[test]
    fn test_compute_storage_sizes_detects_overflow() {
        let err = compute_storage_sizes(&[u64::MAX, 2], 8, false).unwrap_err();
        assert!(matches!(err, crate::Error::InvalidData(_)));
    }

    // Record variable [10, 3, 4] of 4-byte elements: total = 10*3*4*4 = 480
    // bytes; one record spans the trailing axes, 3*4*4 = 48 bytes.
    #[test]
    fn test_compute_storage_sizes_record_dims() {
        let (data_size, record_size) = compute_storage_sizes(&[10, 3, 4], 4, true).unwrap();
        assert_eq!(data_size, 480);
        assert_eq!(record_size, 48);
    }
}
408}