Skip to main content

netcdf_reader/nc4/
variables.rs

//! Map HDF5 datasets to NetCDF-4 variables.
//!
//! Each user-visible HDF5 dataset becomes an `NcVariable`. Dimension-only scale
//! datasets are internal metadata and stay hidden; coordinate variables are
//! dimension scales too, but they must remain visible. The dimensions of
//! non-scale variables are determined from the `DIMENSION_LIST` attribute,
//! which contains object references to the corresponding dimension-scale
//! datasets.
8
9use std::collections::HashMap;
10
11use hdf5_reader::group::Group;
12
13use crate::error::{Error, Result};
14use crate::types::{NcDimension, NcVariable};
15
16use super::attributes;
17use super::types::hdf5_to_nc_type;
18
/// Return the final `/`-separated component of `name`.
///
/// A name without any `/` is returned unchanged; a name ending in `/` yields
/// the empty string.
fn leaf_name(name: &str) -> &str {
    name.rsplit_once('/').map_or(name, |(_, leaf)| leaf)
}
22
23/// Extract variables from an HDF5 group.
24///
25/// Dimension-only scale datasets are internal metadata. Coordinate variables are
26/// also dimension scales, so they are preserved as NcVariables.
27///
28/// `dim_addr_map` maps dimension-scale dataset addresses to their `NcDimension`,
29/// used to resolve `DIMENSION_LIST` object references.
30pub fn extract_variables(
31    group: &Group,
32    dimensions: &[NcDimension],
33    dim_addr_map: &HashMap<u64, NcDimension>,
34    metadata_mode: crate::NcMetadataMode,
35) -> Result<Vec<NcVariable>> {
36    let datasets = group.datasets()?;
37    extract_variables_from_datasets(&datasets, group, dimensions, dim_addr_map, metadata_mode)
38}
39
40pub fn extract_variables_from_datasets(
41    datasets: &[hdf5_reader::Dataset],
42    group: &Group,
43    dimensions: &[NcDimension],
44    dim_addr_map: &HashMap<u64, NcDimension>,
45    metadata_mode: crate::NcMetadataMode,
46) -> Result<Vec<NcVariable>> {
47    let mut variables = Vec::new();
48
49    for ds in datasets {
50        if let Some(variable) =
51            extract_variable(ds, group, dimensions, dim_addr_map, metadata_mode)?
52        {
53            variables.push(variable);
54        }
55    }
56
57    Ok(variables)
58}
59
60pub fn extract_variable(
61    ds: &hdf5_reader::Dataset,
62    group: &Group,
63    dimensions: &[NcDimension],
64    dim_addr_map: &HashMap<u64, NcDimension>,
65    metadata_mode: crate::NcMetadataMode,
66) -> Result<Option<NcVariable>> {
67    let strict = metadata_mode == crate::NcMetadataMode::Strict;
68
69    let is_dim_scale = match ds.attribute("CLASS") {
70        Ok(attr) => match attr.read_string() {
71            Ok(value) => value == "DIMENSION_SCALE",
72            Err(err) if strict => {
73                return Err(Error::InvalidData(format!(
74                    "dataset '{}' has unreadable CLASS attribute: {err}",
75                    ds.name()
76                )))
77            }
78            Err(_) => false,
79        },
80        Err(_) => false,
81    };
82
83    if is_dim_scale && is_dimension_only_scale(ds, strict)? {
84        return Ok(None);
85    }
86
87    let nc_type = match hdf5_to_nc_type(ds.dtype()) {
88        Ok(t) => t,
89        Err(err) if strict => {
90            return Err(Error::InvalidData(format!(
91                "dataset '{}' uses unsupported NetCDF-4 type: {err}",
92                ds.name()
93            )))
94        }
95        Err(_) => return Ok(None),
96    };
97
98    let var_dims = if is_dim_scale {
99        resolve_coordinate_variable_dimensions(ds, dim_addr_map, metadata_mode)?
100    } else {
101        resolve_variable_dimensions(ds, group, dimensions, dim_addr_map, metadata_mode)?
102    };
103    let is_unlimited = var_dims.iter().any(|d| d.is_unlimited);
104    let shape = ds.shape();
105    let elem_size = u64::try_from(nc_type.size()?)
106        .map_err(|_| Error::InvalidData("NetCDF-4 type size exceeds u64 capacity".to_string()))?;
107    let (data_size, record_size) = compute_storage_sizes(shape, elem_size, is_unlimited)?;
108    let var_attrs = attributes::extract_variable_attributes(ds, metadata_mode)?;
109
110    Ok(Some(NcVariable {
111        name: leaf_name(ds.name()).to_string(),
112        dimensions: var_dims,
113        dtype: nc_type,
114        attributes: var_attrs,
115        data_offset: ds.address(),
116        _data_size: data_size,
117        is_record_var: is_unlimited,
118        record_size,
119    }))
120}
121
122fn is_dimension_only_scale(ds: &hdf5_reader::Dataset, strict: bool) -> Result<bool> {
123    match ds.attribute("NAME") {
124        Ok(attr) => match attr.read_string() {
125            Ok(value) => Ok(super::dimensions::is_dimension_without_variable_name(
126                &value,
127            )),
128            Err(err) if strict => Err(Error::InvalidData(format!(
129                "dimension scale '{}' has unreadable NAME attribute: {err}",
130                ds.name()
131            ))),
132            Err(_) => Ok(false),
133        },
134        Err(_) => Ok(false),
135    }
136}
137
138fn resolve_coordinate_variable_dimensions(
139    ds: &hdf5_reader::Dataset,
140    dim_addr_map: &HashMap<u64, NcDimension>,
141    metadata_mode: crate::NcMetadataMode,
142) -> Result<Vec<NcDimension>> {
143    if let Some(dim) = dim_addr_map.get(&ds.address()) {
144        Ok(vec![dim.clone()])
145    } else if metadata_mode == crate::NcMetadataMode::Lossy {
146        Ok(vec![NcDimension {
147            name: leaf_name(ds.name()).to_string(),
148            size: ds.shape().first().copied().unwrap_or(0),
149            is_unlimited: ds
150                .max_dims()
151                .is_some_and(|md| !md.is_empty() && md[0] == u64::MAX),
152        }])
153    } else {
154        Err(Error::InvalidData(format!(
155            "coordinate variable '{}' is not registered as a dimension scale",
156            ds.name()
157        )))
158    }
159}
160
/// Resolve the dimensions of a non-scale variable.
///
/// Thin entry point: every argument is forwarded unchanged to
/// [`resolve_variable_dimensions_with_mode`], which first resolves the
/// dimensions through the dataset's `DIMENSION_LIST` attribute and, only in
/// `Lossy` metadata mode, falls back to matching the dataset shape against
/// `dimensions` by size when that fails.
///
/// # Errors
///
/// Returns whatever error [`resolve_variable_dimensions_with_mode`] returns.
fn resolve_variable_dimensions(
    ds: &hdf5_reader::Dataset,
    group: &Group,
    dimensions: &[NcDimension],
    dim_addr_map: &HashMap<u64, NcDimension>,
    metadata_mode: crate::NcMetadataMode,
) -> Result<Vec<NcDimension>> {
    resolve_variable_dimensions_with_mode(ds, group, dimensions, dim_addr_map, metadata_mode)
}
176
177fn resolve_variable_dimensions_with_mode(
178    ds: &hdf5_reader::Dataset,
179    group: &Group,
180    dimensions: &[NcDimension],
181    dim_addr_map: &HashMap<u64, NcDimension>,
182    metadata_mode: crate::NcMetadataMode,
183) -> Result<Vec<NcDimension>> {
184    match resolve_variable_dimensions_from_dimlist(ds, group, dim_addr_map) {
185        Ok(dims) => Ok(dims),
186        Err(_err) if metadata_mode == crate::NcMetadataMode::Lossy => {
187            Ok(resolve_variable_dimensions_by_size(ds, dimensions))
188        }
189        Err(err) => Err(err),
190    }
191}
192
193/// Resolve variable dimensions via the `DIMENSION_LIST` attribute.
194fn resolve_variable_dimensions_from_dimlist(
195    ds: &hdf5_reader::Dataset,
196    group: &Group,
197    dim_addr_map: &HashMap<u64, NcDimension>,
198) -> Result<Vec<NcDimension>> {
199    let dim_addrs = resolve_dimension_scale_addresses(ds, group)?;
200    let mut var_dims = Vec::with_capacity(dim_addrs.len());
201    for dim_addr in dim_addrs {
202        if let Some(dim) = dim_addr_map.get(&dim_addr) {
203            var_dims.push(dim.clone());
204        } else {
205            return Err(Error::InvalidData(format!(
206                "dataset '{}' references unknown dimension scale address {dim_addr:#x}",
207                ds.name()
208            )));
209        }
210    }
211
212    // Apply unlimited status from the dataset's max_dims.
213    if let Some(max_dims) = ds.max_dims() {
214        for (i, md) in max_dims.iter().enumerate() {
215            if *md == u64::MAX && i < var_dims.len() {
216                var_dims[i].is_unlimited = true;
217            }
218        }
219    }
220
221    Ok(var_dims)
222}
223
224pub(crate) fn resolve_dimension_scale_addresses(
225    ds: &hdf5_reader::Dataset,
226    group: &Group,
227) -> Result<Vec<u64>> {
228    let attr = ds.attribute("DIMENSION_LIST").map_err(|_| {
229        Error::InvalidData(format!(
230            "dataset '{}' is missing required DIMENSION_LIST metadata",
231            ds.name()
232        ))
233    })?;
234    let raw_data = &attr.raw_data;
235    let ndim = ds.ndim();
236    let offset_size = group.offset_size();
237
238    if ndim == 0 {
239        return Ok(Vec::new());
240    }
241    if raw_data.is_empty() {
242        return Err(Error::InvalidData(format!(
243            "dataset '{}' has empty DIMENSION_LIST metadata",
244            ds.name()
245        )));
246    }
247
248    let entry_size = 4 + usize::from(offset_size) + 4;
249    if raw_data.len() < ndim * entry_size {
250        return Err(Error::InvalidData(format!(
251            "dataset '{}' has truncated DIMENSION_LIST metadata",
252            ds.name()
253        )));
254    }
255
256    let mut dim_addrs = Vec::with_capacity(ndim);
257    let mut cursor = hdf5_reader::io::Cursor::new(raw_data);
258
259    for _ in 0..ndim {
260        let seq_len = cursor.read_u32_le().map_err(|err| {
261            Error::InvalidData(format!(
262                "dataset '{}' has invalid DIMENSION_LIST entry count: {err}",
263                ds.name()
264            ))
265        })? as usize;
266        let heap_addr = cursor.read_offset(offset_size).map_err(|err| {
267            Error::InvalidData(format!(
268                "dataset '{}' has invalid DIMENSION_LIST heap address: {err}",
269                ds.name()
270            ))
271        })?;
272        let heap_idx = cursor.read_u32_le().map_err(|err| {
273            Error::InvalidData(format!(
274                "dataset '{}' has invalid DIMENSION_LIST heap index: {err}",
275                ds.name()
276            ))
277        })? as u16;
278
279        if seq_len == 0 || hdf5_reader::io::Cursor::is_undefined_offset(heap_addr, offset_size) {
280            return Err(Error::InvalidData(format!(
281                "dataset '{}' has an unresolved DIMENSION_LIST reference",
282                ds.name()
283            )));
284        }
285
286        let collection = hdf5_reader::global_heap::GlobalHeapCollection::parse_at_storage(
287            group.storage(),
288            heap_addr,
289            offset_size,
290            group.length_size(),
291        )
292        .map_err(|err| {
293            Error::InvalidData(format!(
294                "dataset '{}' has unreadable DIMENSION_LIST heap object: {err}",
295                ds.name()
296            ))
297        })?;
298
299        let heap_obj = collection.get_object(heap_idx).ok_or_else(|| {
300            Error::InvalidData(format!(
301                "dataset '{}' references missing DIMENSION_LIST heap object {}",
302                ds.name(),
303                heap_idx
304            ))
305        })?;
306
307        let refs = hdf5_reader::reference::read_object_references(&heap_obj.data, offset_size)
308            .map_err(|err| {
309                Error::InvalidData(format!(
310                    "dataset '{}' has invalid DIMENSION_LIST references: {err}",
311                    ds.name()
312                ))
313            })?;
314
315        if refs.is_empty() {
316            return Err(Error::InvalidData(format!(
317                "dataset '{}' has empty DIMENSION_LIST references",
318                ds.name()
319            )));
320        }
321
322        dim_addrs.push(refs[0]);
323    }
324
325    Ok(dim_addrs)
326}
327
328fn compute_storage_sizes(shape: &[u64], elem_size: u64, is_unlimited: bool) -> Result<(u64, u64)> {
329    let total_elements =
330        crate::types::checked_shape_elements(shape, "NetCDF-4 variable element count")?;
331    let data_size = crate::types::checked_mul_u64(
332        total_elements,
333        elem_size,
334        "NetCDF-4 variable size in bytes",
335    )?;
336
337    let record_elements = if is_unlimited && shape.len() > 1 {
338        crate::types::checked_shape_elements(&shape[1..], "NetCDF-4 record element count")?
339    } else {
340        1
341    };
342    let record_size =
343        crate::types::checked_mul_u64(record_elements, elem_size, "NetCDF-4 record size in bytes")?;
344
345    Ok((data_size, record_size))
346}
347
348/// Resolve dimensions for a variable by matching its shape against the
349/// group's dimensions. Falls back to anonymous dimensions from the shape.
350fn resolve_variable_dimensions_by_size(
351    ds: &hdf5_reader::Dataset,
352    dimensions: &[NcDimension],
353) -> Vec<NcDimension> {
354    let shape = ds.shape();
355
356    // Try to match dimensions by size (simple heuristic when DIMENSION_LIST
357    // isn't available or parseable). This matches dims in order.
358    let mut var_dims = Vec::with_capacity(shape.len());
359    let mut used = vec![false; dimensions.len()];
360
361    for &dim_size in shape {
362        let mut matched = false;
363        for (i, dim) in dimensions.iter().enumerate() {
364            if !used[i] && dim.size == dim_size {
365                var_dims.push(dim.clone());
366                used[i] = true;
367                matched = true;
368                break;
369            }
370        }
371        if !matched {
372            // Create an anonymous dimension
373            var_dims.push(NcDimension {
374                name: format!("dim_{}", dim_size),
375                size: dim_size,
376                is_unlimited: false,
377            });
378        }
379    }
380
381    // Check if any matched dimension is unlimited and update dataspace info
382    if let Some(max_dims) = ds.max_dims() {
383        for (i, md) in max_dims.iter().enumerate() {
384            if *md == u64::MAX && i < var_dims.len() {
385                var_dims[i].is_unlimited = true;
386            }
387        }
388    }
389
390    var_dims
391}
392
// Unit tests for the pure size arithmetic in `compute_storage_sizes`; the
// dataset-facing functions in this file are not exercised here.
#[cfg(test)]
mod tests {
    use super::compute_storage_sizes;

    /// A shape of `[u64::MAX, 2]` must be rejected with `InvalidData` rather
    /// than producing a wrapped size (presumably the element-count product is
    /// what overflows — the check itself lives in `crate::types`).
    #[test]
    fn compute_storage_sizes_detects_overflow() {
        let err = compute_storage_sizes(&[u64::MAX, 2], 8, false).unwrap_err();
        assert!(matches!(err, crate::Error::InvalidData(_)));
    }

    /// Record variable of shape `[10, 3, 4]` with 4-byte elements:
    /// total = 10 * 3 * 4 * 4 = 480 bytes, one record = 3 * 4 * 4 = 48 bytes.
    #[test]
    fn compute_storage_sizes_record_dims() {
        let (data_size, record_size) = compute_storage_sizes(&[10, 3, 4], 4, true).unwrap();
        assert_eq!(data_size, 480);
        assert_eq!(record_size, 48);
    }
}