// netcdf_reader/nc4/variables.rs

//! Map HDF5 datasets to NetCDF-4 variables.
//!
//! Each HDF5 dataset that is NOT a dimension scale becomes an NcVariable.
//! The variable's dimensions are determined from the `DIMENSION_LIST` attribute,
//! which contains object references to the corresponding dimension-scale datasets.

use std::collections::HashMap;

use hdf5_reader::group::Group;

use crate::error::Result;
use crate::types::{NcDimension, NcVariable};

use super::attributes;
use super::types::hdf5_to_nc_type;
/// Return the final component of a slash-separated HDF5 object path,
/// or the whole string when it contains no `/`.
fn leaf_name(name: &str) -> &str {
    match name.rfind('/') {
        Some(pos) => &name[pos + 1..],
        None => name,
    }
}

21/// Extract variables from an HDF5 group.
22///
23/// Datasets with `CLASS=DIMENSION_SCALE` are dimensions, not variables.
24/// All other datasets become NcVariables.
25///
26/// `dim_addr_map` maps dimension-scale dataset addresses to their `NcDimension`,
27/// used to resolve `DIMENSION_LIST` object references.
28pub fn extract_variables(
29    group: &Group<'_>,
30    dimensions: &[NcDimension],
31    dim_addr_map: &HashMap<u64, NcDimension>,
32) -> Result<Vec<NcVariable>> {
33    let datasets = match group.datasets() {
34        Ok(ds) => ds,
35        Err(_) => return Ok(Vec::new()),
36    };
37    extract_variables_from_datasets(&datasets, group, dimensions, dim_addr_map)
38}
39
40pub fn extract_variables_from_datasets(
41    datasets: &[hdf5_reader::Dataset<'_>],
42    group: &Group<'_>,
43    dimensions: &[NcDimension],
44    dim_addr_map: &HashMap<u64, NcDimension>,
45) -> Result<Vec<NcVariable>> {
46    let mut variables = Vec::new();
47
48    for ds in datasets {
49        // Skip dimension scale datasets
50        let is_dim_scale = ds
51            .attribute("CLASS")
52            .ok()
53            .and_then(|attr| attr.read_string().ok())
54            .map(|s| s == "DIMENSION_SCALE")
55            .unwrap_or(false);
56
57        if is_dim_scale {
58            continue;
59        }
60
61        // Map the HDF5 datatype to a NetCDF type
62        let nc_type = match hdf5_to_nc_type(ds.dtype()) {
63            Ok(t) => t,
64            Err(_) => continue, // Skip datasets with unsupported types
65        };
66
67        // Resolve dimensions from DIMENSION_LIST attribute, falling back to size heuristic
68        let var_dims = resolve_variable_dimensions_from_dimlist(ds, group, dim_addr_map)
69            .unwrap_or_else(|| resolve_variable_dimensions_by_size(ds, dimensions));
70
71        // Detect if this variable uses an unlimited dimension
72        let is_unlimited = var_dims.iter().any(|d| d.is_unlimited);
73
74        let shape = ds.shape();
75        let (data_size, record_size) =
76            compute_storage_sizes(shape, nc_type.size() as u64, is_unlimited)?;
77
78        // Extract variable-level attributes
79        let var_attrs = attributes::extract_variable_attributes(ds)?;
80
81        variables.push(NcVariable {
82            name: leaf_name(ds.name()).to_string(),
83            dimensions: var_dims,
84            dtype: nc_type,
85            attributes: var_attrs,
86            data_offset: ds.address(),
87            _data_size: data_size,
88            is_record_var: is_unlimited,
89            record_size,
90        });
91    }
92
93    Ok(variables)
94}
95
96/// Resolve variable dimensions via the `DIMENSION_LIST` attribute.
97///
98/// `DIMENSION_LIST` is a VLen-of-object-reference attribute. Each entry is a
99/// variable-length sequence of object references pointing to dimension-scale
100/// datasets. We parse the raw attribute data to extract these references.
101///
102/// Returns `None` if the attribute is missing or unparseable.
103fn resolve_variable_dimensions_from_dimlist(
104    ds: &hdf5_reader::Dataset<'_>,
105    group: &Group<'_>,
106    dim_addr_map: &HashMap<u64, NcDimension>,
107) -> Option<Vec<NcDimension>> {
108    let attr = ds.attribute("DIMENSION_LIST").ok()?;
109    let raw_data = &attr.raw_data;
110    let ndim = ds.ndim();
111    let offset_size = group.offset_size();
112    let file_data = group.file_data();
113
114    if raw_data.is_empty() || ndim == 0 {
115        return None;
116    }
117
118    // Each vlen entry in the attribute data is:
119    //   seq_len: u32 (number of references in this vlen)
120    //   heap_addr: offset_size bytes (global heap collection address)
121    //   heap_idx: u32 (object index within the global heap collection)
122    let entry_size = 4 + offset_size as usize + 4;
123    if raw_data.len() < ndim * entry_size {
124        return None;
125    }
126
127    let mut var_dims = Vec::with_capacity(ndim);
128    let mut cursor = hdf5_reader::io::Cursor::new(raw_data);
129
130    for _ in 0..ndim {
131        let seq_len = cursor.read_u32_le().ok()? as usize;
132        let heap_addr = cursor.read_offset(offset_size).ok()?;
133        let heap_idx = cursor.read_u32_le().ok()? as u16;
134
135        if seq_len == 0 || hdf5_reader::io::Cursor::is_undefined_offset(heap_addr, offset_size) {
136            // No reference for this dimension — can't resolve.
137            return None;
138        }
139
140        // Parse the global heap collection at heap_addr.
141        let mut heap_cursor = hdf5_reader::io::Cursor::new(file_data);
142        heap_cursor.set_position(heap_addr);
143        let collection = hdf5_reader::global_heap::GlobalHeapCollection::parse(
144            &mut heap_cursor,
145            offset_size,
146            group.length_size(),
147        )
148        .ok()?;
149
150        let heap_obj = collection.get_object(heap_idx)?;
151
152        // The heap object data contains `seq_len` object references,
153        // each `offset_size` bytes.
154        let refs =
155            hdf5_reader::reference::read_object_references(&heap_obj.data, offset_size).ok()?;
156
157        if refs.is_empty() {
158            return None;
159        }
160
161        // Use the first reference (there's usually only one per dimension).
162        let dim_addr = refs[0];
163        if let Some(dim) = dim_addr_map.get(&dim_addr) {
164            var_dims.push(dim.clone());
165        } else {
166            // Reference points to unknown address — can't resolve.
167            return None;
168        }
169    }
170
171    // Apply unlimited status from the dataset's max_dims.
172    if let Some(max_dims) = ds.max_dims() {
173        for (i, md) in max_dims.iter().enumerate() {
174            if *md == u64::MAX && i < var_dims.len() {
175                var_dims[i].is_unlimited = true;
176            }
177        }
178    }
179
180    Some(var_dims)
181}
182
183fn compute_storage_sizes(shape: &[u64], elem_size: u64, is_unlimited: bool) -> Result<(u64, u64)> {
184    let total_elements =
185        crate::types::checked_shape_elements(shape, "NetCDF-4 variable element count")?;
186    let data_size = crate::types::checked_mul_u64(
187        total_elements,
188        elem_size,
189        "NetCDF-4 variable size in bytes",
190    )?;
191
192    let record_elements = if is_unlimited && shape.len() > 1 {
193        crate::types::checked_shape_elements(&shape[1..], "NetCDF-4 record element count")?
194    } else {
195        1
196    };
197    let record_size =
198        crate::types::checked_mul_u64(record_elements, elem_size, "NetCDF-4 record size in bytes")?;
199
200    Ok((data_size, record_size))
201}
202
203/// Resolve dimensions for a variable by matching its shape against the
204/// group's dimensions. Falls back to anonymous dimensions from the shape.
205fn resolve_variable_dimensions_by_size(
206    ds: &hdf5_reader::Dataset<'_>,
207    dimensions: &[NcDimension],
208) -> Vec<NcDimension> {
209    let shape = ds.shape();
210
211    // Try to match dimensions by size (simple heuristic when DIMENSION_LIST
212    // isn't available or parseable). This matches dims in order.
213    let mut var_dims = Vec::with_capacity(shape.len());
214    let mut used = vec![false; dimensions.len()];
215
216    for &dim_size in shape {
217        let mut matched = false;
218        for (i, dim) in dimensions.iter().enumerate() {
219            if !used[i] && dim.size == dim_size {
220                var_dims.push(dim.clone());
221                used[i] = true;
222                matched = true;
223                break;
224            }
225        }
226        if !matched {
227            // Create an anonymous dimension
228            var_dims.push(NcDimension {
229                name: format!("dim_{}", dim_size),
230                size: dim_size,
231                is_unlimited: false,
232            });
233        }
234    }
235
236    // Check if any matched dimension is unlimited and update dataspace info
237    if let Some(max_dims) = ds.max_dims() {
238        for (i, md) in max_dims.iter().enumerate() {
239            if *md == u64::MAX && i < var_dims.len() {
240                var_dims[i].is_unlimited = true;
241            }
242        }
243    }
244
245    var_dims
246}
247
#[cfg(test)]
mod tests {
    use super::compute_storage_sizes;

    #[test]
    fn test_compute_storage_sizes_detects_overflow() {
        // u64::MAX * 2 elements cannot be represented; expect InvalidData.
        let result = compute_storage_sizes(&[u64::MAX, 2], 8, false);
        assert!(matches!(result, Err(crate::Error::InvalidData(_))));
    }

    #[test]
    fn test_compute_storage_sizes_record_dims() {
        // 10 records of 3*4 elements at 4 bytes each: 480 total, 48 per record.
        let (total, per_record) = compute_storage_sizes(&[10, 3, 4], 4, true).unwrap();
        assert_eq!(total, 480);
        assert_eq!(per_record, 48);
    }
}
264}