gistools/readers/netcdf/
mod.rs

1use crate::parsers::{FeatureReader, Reader};
2use alloc::{boxed::Box, collections::BTreeMap, string::String, vec, vec::Vec};
3use core::cell::RefCell;
4use s2json::{MValue, Properties, ValueType, VectorFeature, VectorGeometry, VectorPoint};
5
6// TODO: I don't know why anymore but the FileReader is SLOOOOOW, I think it re-reads over and over.
7// But the BufferReader is fast. 99% of nc files are small so this is probably not worth the effort
8// to fix anytime soon
9
10/// The kind of data that can be stored in a NetCDF file
11#[derive(Debug, Clone, PartialEq)]
12pub enum CDFValue {
13    /// String case
14    String(String),
15    /// Number case
16    Number(f64),
17    /// Array of numbers case
18    Array(Vec<f64>),
19}
20impl Default for CDFValue {
21    fn default() -> Self {
22        CDFValue::Number(0.0)
23    }
24}
25impl CDFValue {
26    /// Converts a CDFValue to a number
27    pub fn to_num(&self) -> f64 {
28        match self {
29            CDFValue::Number(n) => *n,
30            _ => 0.0,
31        }
32    }
33    /// Get the number in the array at a given index
34    pub fn get_index(&self, index: u64) -> f64 {
35        match self {
36            CDFValue::Array(v) => v[index as usize],
37            _ => 0.0,
38        }
39    }
40}
41impl From<&CDFValue> for ValueType {
42    fn from(value: &CDFValue) -> Self {
43        match value {
44            CDFValue::String(s) => s.into(),
45            CDFValue::Number(n) => (*n).into(),
46            CDFValue::Array(v) => v.clone().into(),
47        }
48    }
49}
50impl From<&str> for CDFValue {
51    fn from(value: &str) -> Self {
52        CDFValue::String(value.into())
53    }
54}
55impl From<f64> for CDFValue {
56    fn from(value: f64) -> Self {
57        CDFValue::Number(value)
58    }
59}
60impl From<Vec<f64>> for CDFValue {
61    fn from(value: Vec<f64>) -> Self {
62        CDFValue::Array(value)
63    }
64}
65
66/// An NetCDF Shaped Vector Feature
67pub type CDFVectorFeature = VectorFeature<(), Properties, MValue>;
68
69/// The kind of attributes that can be stored in a NetCDF file. Similar to a GeoJSON Properties object
70pub type CDFAttributes = BTreeMap<String, CDFValue>;
71
72/// Track the dimension and its max value (can be infinity)
73#[derive(Debug, Default, Clone, PartialEq)]
74pub struct CDFDimension {
75    /// index of the dimension
76    pub index: u64,
77    /// name of the dimension
78    pub name: String,
79    /// size of the dimension
80    pub size: u64,
81}
82
83/// Track information about the dimensions, which is "unlimited" dimension, and variable sizes
84#[derive(Debug, Default, Clone, PartialEq)]
85pub struct CDFRecordDimension {
86    /// Length of the record dimension sum of the var_size's of all the record variables
87    pub size: u64,
88    /// ID of the record dimension
89    pub id: Option<u64>,
90    /// Name of the record dimension
91    pub name: Option<String>,
92    /// Step of the record dimension
93    pub record_step: Option<u64>,
94}
95
96/// A NetCDF variable
97#[derive(Debug, Default, Clone, PartialEq)]
98pub struct CDFVariable {
99    /// name of the variable
100    pub name: String,
101    /// Array with the dimension IDs of the variable
102    pub dimensions: Vec<CDFDimension>,
103    /// Array with the attributes of the variable
104    pub attributes: CDFAttributes,
105    /// type of the variable
106    pub r#type: CDFDataType,
107    /// size of the variable
108    pub size: u64,
109    /// offset where of the variable begins
110    pub offset: u64,
111    /// True if is a record variable, false otherwise (unlimited size)
112    pub record: bool,
113}
114
115// Grammar constants
116const NC_UNLIMITED: u64 = 0;
117const NC_DIMENSION: u64 = 10;
118const NC_VARIABLE: u64 = 11;
119const NC_ATTRIBUTE: u64 = 12;
120
121/// Enum of the NetCDF data types available
122#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
123pub enum CDFDataType {
124    /// Byte size (1 byte)
125    #[default]
126    BYTE = 1,
127    /// Char size (1 byte)
128    CHAR = 2,
129    /// Short size (2 bytes)
130    SHORT = 3,
131    /// Integer size (4 bytes)
132    INT = 4,
133    /// Float size (4 bytes)
134    FLOAT = 5,
135    /// Double size (8 bytes)
136    DOUBLE = 6,
137}
138impl From<u64> for CDFDataType {
139    fn from(value: u64) -> Self {
140        match value {
141            2 => CDFDataType::CHAR,
142            3 => CDFDataType::SHORT,
143            4 => CDFDataType::INT,
144            5 => CDFDataType::FLOAT,
145            6 => CDFDataType::DOUBLE,
146            _ => CDFDataType::BYTE,
147        }
148    }
149}
150
151/// Given a type, get the number of bytes it represents
152///
153/// ## Parameters
154/// - `type`: the NetCDF data type
155///
156/// ## Returns
157/// The number of bytes for the data type
158pub fn netcdf_type_to_bytes(r#type: CDFDataType) -> u64 {
159    match r#type {
160        CDFDataType::BYTE | CDFDataType::CHAR => 1,
161        CDFDataType::SHORT => 2,
162        CDFDataType::INT | CDFDataType::FLOAT => 4,
163        CDFDataType::DOUBLE => 8,
164    }
165}
166
167/// User defined options on how to parse the NetCDF file
168#[derive(Debug, Default, Clone, PartialEq)]
169pub struct NetCDFReaderOptions {
170    /// If provided the lookup of the longitude [Default='lon']
171    pub lon_key: Option<String>,
172    /// If provided the lookup of the latitude [Default='lat']
173    pub lat_key: Option<String>,
174    /// If provided the lookup for the height value [Default=undefined]
175    pub height_key: Option<String>,
176    /// List of fields to include in the feature properties
177    pub prop_fields: Option<Vec<String>>,
178}
179
180/// # NetCDF v3.x Reader
181///
182/// ## Description
183/// Read the NetCDF v3.x file format
184///
185/// [See specification](https://www.unidata.ucar.edu/software/netcdf/docs/file_format_specifications.html)
186///
187/// Implements the [`FeatureReader`] trait
188///
189/// ## Usage
190///
191/// The methods you have access to:
192/// - [`NetCDFReader::new`]: Create a new NetCDFReader
193/// - [`NetCDFReader::len`]: Returns the number of records
194/// - [`NetCDFReader::is_empty`]: Returns true if the reader is empty
195/// - [`NetCDFReader::get_properties`]: Returns the properties for a given index
196/// - [`NetCDFReader::get_point`]: Get the point at a given index
197/// - [`NetCDFReader::get_feature`]: Reads a point in at index as a feature
198/// - [`NetCDFReader::get_data_variable`]: Retrieves the data for a given variable
199/// - [`NetCDFReader::iter`]: Create an iterator over the features
200/// - [`NetCDFReader::par_iter`]: Create a parallel iterator over the features
201///
202/// ### Buffer Reader
203/// ```rust
204/// use gistools::{
205///     parsers::{FeatureReader, BufferReader},
206///     readers::{NetCDFReader, NetCDFReaderOptions},
207/// };
208///
209/// // Ignore this, used to setup example
210/// use std::path::PathBuf;
211/// let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
212/// path.push("tests/readers/netcdf/fixtures/ichthyop.nc");
213/// let data: Vec<u8> = std::fs::read(path).unwrap();
214///
215/// let netcdf_reader = NetCDFReader::new(
216///     BufferReader::from(data),
217///     Some(NetCDFReaderOptions {
218///         lon_key: Some("lon".into()),
219///         lat_key: Some("lat".into()),
220///         height_key: Some("depth".into()),
221///         prop_fields: Some(vec!["depth".into()]),
222///     }),
223/// );
224/// assert_eq!(netcdf_reader.len(), 49);
225/// let features: Vec<_> = netcdf_reader.iter().collect();
226/// ```
227///
228/// ## Links
229/// - <https://www.unidata.ucar.edu/software/netcdf/docs/file_format_specifications.html>
230#[derive(Debug, Clone)]
231pub struct NetCDFReader<T: Reader> {
232    reader: T,
233    /// Record dimension
234    pub record_dimension: CDFRecordDimension,
235    /// List of dimensions
236    pub dimensions: Vec<CDFDimension>,
237    /// List of global attributes
238    pub global_attributes: CDFAttributes,
239    /// List of variables
240    pub variables: Vec<CDFVariable>,
241    /// Describes if offsets are 32 or 64 bits
242    pub is64: bool,
243    /// Track the cursor for parsing the header
244    cursor: RefCell<u64>,
245    lon_key: String,
246    lat_key: String,
247    height_key: Option<String>,
248    prop_fields: Vec<String>,
249}
250impl<T: Reader> NetCDFReader<T> {
251    /// Creates a new NetCDF reader
252    pub fn new(reader: T, options: Option<NetCDFReaderOptions>) -> Self {
253        // Validate that it's a NetCDF file
254        let magic = reader.parse_string(Some(0), Some(3));
255        if &magic != "CDF" {
256            panic!("Not a valid NetCDF file: should start with CDF");
257        }
258        // Check the NetCDF format
259        let is64 = reader.uint8(Some(3)) != 1;
260        let options = options.unwrap_or_default();
261        let mut cdf_reader = NetCDFReader {
262            reader,
263            record_dimension: CDFRecordDimension {
264                size: 0,
265                id: None,
266                name: None,
267                record_step: None,
268            },
269            dimensions: vec![],
270            global_attributes: BTreeMap::new(),
271            variables: vec![],
272            is64,
273            cursor: 4.into(),
274            lon_key: options.lon_key.unwrap_or("lon".into()),
275            lat_key: options.lat_key.unwrap_or("lat".into()),
276            height_key: options.height_key,
277            prop_fields: options.prop_fields.unwrap_or_default(),
278        };
279        // Read the header
280        cdf_reader.parse_header();
281
282        cdf_reader
283    }
284
285    /// Returns the number of records
286    pub fn len(&self) -> u64 {
287        let lat = self.get_data_variable(self.lat_key.clone());
288        if let Some(lat) = lat {
289            return lat.len() as u64;
290        }
291        0
292    }
293
294    /// Check if the reader is empty
295    pub fn is_empty(&self) -> bool {
296        self.len() == 0
297    }
298
299    /// Get the properties at a given index
300    pub fn get_properties(&self, index: u64) -> Option<MValue> {
301        let mut m = MValue::new();
302        for field in self.prop_fields.clone().into_iter() {
303            let value = self.get_data_variable(field.clone());
304            if let Some(value) = value {
305                let field: String = field.clone();
306                let value: ValueType = (&value[index as usize]).into();
307                m.insert(field, value);
308            }
309        }
310        Some(m)
311    }
312
313    /// Get the point at a given index
314    pub fn get_point(&self, index: u64) -> Option<VectorPoint<MValue>> {
315        if index >= self.len() {
316            return None;
317        }
318        let lat = self.get_data_variable(self.lat_key.clone());
319        let lon = self.get_data_variable(self.lon_key.clone());
320        let height = self.get_data_variable(self.height_key.clone().unwrap_or_default());
321        if let (Some(lat), Some(lon)) = (lat, lon) {
322            let lat = lat[0].get_index(index);
323            let lon = lon[0].get_index(index);
324            let m = self.get_properties(index);
325            return Some(VectorPoint::new(lon, lat, height.map(|h| h[index as usize].to_num()), m));
326        }
327        None
328    }
329
330    /// Reads a point in at index as a feature
331    pub fn get_feature(&self, index: u64) -> Option<CDFVectorFeature> {
332        self.get_point(index).map(|point| {
333            VectorFeature::new_wm(
334                None,
335                Properties::default(),
336                VectorGeometry::new_point(point, None),
337                None,
338            )
339        })
340    }
341
342    /// Retrieves the data for a given variable
343    ///
344    /// ## Parameters
345    /// - `variable_name`: Name of the variable to search or variable object
346    ///
347    /// ## Returns
348    /// The variable values
349    pub fn get_data_variable(&self, variable_name: String) -> Option<Vec<CDFValue>> {
350        let variable = self.variables.iter().find(|val| val.name == variable_name).cloned();
351        // return nothing if not found
352        if let Some(variable) = variable {
353            // go to the offset position
354            *self.cursor.borrow_mut() = variable.offset;
355            // return the data
356            return if variable.record {
357                Some(self.get_record(variable))
358            } else {
359                Some(self.get_non_record(variable))
360            };
361        }
362        None
363    }
364
365    // INTERNAL
366
367    /// Internal method to get the current offset
368    ///
369    /// ## Returns
370    /// The current offset
371    fn get_offset(&self) -> u64 {
372        if self.is64 { self.get_u64() } else { self.get_u32() }
373    }
374
375    /// Internal method to get a 32 but value under the cursor
376    ///
377    /// ## Returns
378    /// A 32 bit value
379    fn get_u32(&self) -> u64 {
380        let data = self.reader.uint32_be(Some(*self.cursor.borrow()));
381        *self.cursor.borrow_mut() += 4;
382        data as u64
383    }
384
385    /// Internal method to get a 64 but value under the cursor
386    ///
387    /// ## Returns
388    /// A 64 bit value
389    fn get_u64(&self) -> u64 {
390        let data = self.reader.uint64_be(Some(*self.cursor.borrow()));
391        *self.cursor.borrow_mut() += 8;
392        data
393    }
394
395    /// Internal method to read a string under the cursor
396    ///
397    /// ## Returns
398    /// A string of the name
399    fn get_name(&self) -> String {
400        let name_length = self.get_u32();
401        let name = self.reader.parse_string(Some(*self.cursor.borrow()), Some(name_length));
402        *self.cursor.borrow_mut() += name_length;
403        self.padding();
404
405        name.trim().into()
406    }
407
408    /// Internal method to Parse the header
409    fn parse_header(&mut self) {
410        // build dimension list
411        self.record_dimension.size = self.get_u32();
412        self.build_dimension_list();
413        // build global attributes
414        self.global_attributes = self.build_attributes();
415        // build the variable list
416        self.build_variables_list();
417    }
418
419    /// Get the data type
420    ///
421    /// ## Parameters
422    /// - `type`: the data type
423    /// - `size`: the data size
424    ///
425    /// ## Returns
426    /// The data type
427    fn get_type(&self, r#type: CDFDataType, size: u64) -> CDFValue {
428        let data = if r#type == CDFDataType::BYTE {
429            let mut res = vec![];
430            let mut i = 0;
431            while i < size {
432                res.push(self.reader.uint8(Some(*self.cursor.borrow())) as f64);
433                *self.cursor.borrow_mut() += 1;
434                i += 1;
435            }
436            CDFValue::Array(res)
437        } else if r#type == CDFDataType::CHAR {
438            let res = self.reader.parse_string(Some(*self.cursor.borrow()), Some(size));
439            *self.cursor.borrow_mut() += size;
440            CDFValue::String(res.trim().into())
441        } else if r#type == CDFDataType::SHORT
442            || r#type == CDFDataType::INT
443            || r#type == CDFDataType::FLOAT
444            || r#type == CDFDataType::DOUBLE
445        {
446            let step = if r#type == CDFDataType::DOUBLE {
447                8
448            } else if r#type == CDFDataType::SHORT {
449                2
450            } else {
451                4
452            };
453            let read_number: Box<dyn Fn(u64) -> f64> = if r#type == CDFDataType::SHORT {
454                Box::new(|offset: u64| self.reader.int16_be(Some(offset)) as f64)
455            } else if r#type == CDFDataType::INT {
456                Box::new(|offset: u64| self.reader.int32_be(Some(offset)) as f64)
457            } else if r#type == CDFDataType::FLOAT {
458                Box::new(|offset: u64| self.reader.f32_be(Some(offset)) as f64)
459            } else {
460                Box::new(|offset: u64| self.reader.f64_be(Some(offset)))
461            };
462            let mut res = vec![];
463            let mut i = 0;
464            while i < size {
465                res.push(read_number(*self.cursor.borrow()));
466                *self.cursor.borrow_mut() += step;
467                i += 1;
468            }
469            if res.len() == 1 { CDFValue::Number(res[0]) } else { CDFValue::Array(res) }
470        } else {
471            panic!("non valid type {:?}", r#type);
472        };
473
474        self.padding();
475
476        data
477    }
478
479    /// Internal method to build the dimension list
480    fn build_dimension_list(&mut self) {
481        let dim_list_tag = self.get_u32();
482
483        if dim_list_tag == 0 {
484            let ensure_empty = self.get_u32();
485            if ensure_empty != 0 {
486                panic!("wrong empty tag for list of dimensions");
487            }
488        } else {
489            if dim_list_tag != NC_DIMENSION {
490                panic!("wrong tag for list of dimensions");
491            }
492
493            // Length of dimensions
494            let dimension_size = self.get_u32();
495            // populate `name` and `size` for each dimension
496            let mut index = 0;
497            while index < dimension_size {
498                // Read name
499                let name = self.get_name();
500                // Read dimension size
501                let size = self.get_u32();
502                if size == NC_UNLIMITED {
503                    // in netcdf 3 one field can be of size unlimited
504                    self.record_dimension.id = Some(index);
505                    self.record_dimension.name = Some(name.clone());
506                }
507                // store the dimension
508                self.dimensions.push(CDFDimension { index, name, size });
509
510                index += 1;
511            }
512        }
513    }
514
515    /// Internal method to build attributes including global attributes
516    ///
517    /// ## Returns
518    /// Attributes from a block of data at a given offset
519    fn build_attributes(&mut self) -> CDFAttributes {
520        let mut atrributes = CDFAttributes::default();
521        let g_att_tag = self.get_u32();
522        if g_att_tag == 0 {
523            let ensure_empty = self.get_u32();
524            if ensure_empty != 0 {
525                panic!("wrong empty tag for list of attributes");
526            }
527        } else {
528            if g_att_tag != NC_ATTRIBUTE {
529                panic!("wrong tag for list of attributes");
530            }
531            // Length of attributes
532            let attribute_size = self.get_u32();
533            // Populate `name`, `type` and `value` for each attribute
534            let mut ga_idx = 0;
535            while ga_idx < attribute_size {
536                // Read name, type, and size of data block
537                let name = self.get_name();
538                let r#type: CDFDataType = self.get_u32().into();
539                let size = self.get_u32();
540                // store the attribute key-value
541                let data = self.get_type(r#type, size);
542                atrributes.insert(name, data);
543                ga_idx += 1;
544            }
545        }
546
547        atrributes
548    }
549
550    /// Internal method to build a variable list from a block of data at a given offset
551    fn build_variables_list(&mut self) {
552        let var_tag = self.get_u32();
553        let mut record_step = 0;
554        if var_tag == 0 {
555            let ensure_empty = self.get_u32();
556            if ensure_empty != 0 {
557                panic!("wrong empty tag for list of variables");
558            }
559        } else {
560            if var_tag != NC_VARIABLE {
561                panic!("wrong tag for list of variables");
562            }
563            // Length of variables
564            let var_size = self.get_u32();
565            let mut v_idx = 0;
566            while v_idx < var_size {
567                // Read name, dimensionality, and index into the list of dimensions
568                let name = self.get_name();
569                let dimensionality = self.get_u32();
570                let mut dimensions_ids = vec![];
571                let mut dim = 0;
572                while dim < dimensionality {
573                    dimensions_ids.push(self.get_u32());
574                    dim += 1;
575                }
576                // Read variables size
577                let attributes = self.build_attributes();
578                // Read type
579                let r#type: CDFDataType = self.get_u32().into();
580                // Read variable size
581                // The 32-bit var_size field is not large enough to contain the size of variables that require
582                // more than 2^32 - 4 bytes, so 2^32 - 1 is used in the var_size field for such variables.
583                let var_size = self.get_u32();
584                // Read offset
585                let offset = self.get_offset();
586                let mut record = false;
587                // Count amount of record variables
588                if !dimensions_ids.is_empty()
589                    && dimensions_ids[0] == self.record_dimension.id.unwrap_or_default()
590                {
591                    record_step += var_size;
592                    record = true;
593                }
594                self.variables.push(CDFVariable {
595                    name,
596                    dimensions: dimensions_ids
597                        .iter()
598                        .map(|id| self.dimensions[*id as usize].clone())
599                        .collect(),
600                    attributes,
601                    r#type,
602                    size: var_size,
603                    offset,
604                    record,
605                });
606                v_idx += 1;
607            }
608        }
609        self.record_dimension.record_step = Some(record_step);
610    }
611
612    /// Read data for the given non-record variable
613    ///
614    /// ## Parameters
615    /// - `variable`: Variable metadata
616    ///
617    /// ## Returns
618    ///  Data of the element
619    fn get_non_record(&self, variable: CDFVariable) -> Vec<CDFValue> {
620        // variable type
621        let CDFVariable { size, r#type, .. } = variable;
622        // size of the data
623        let total_size = size / netcdf_type_to_bytes(r#type);
624        // iterates over the data
625        let mut data = vec![];
626        let mut i = 0;
627        while i < total_size {
628            data.push(self.get_type(r#type, 1));
629            i += 1;
630        }
631
632        data
633    }
634
635    /// Read data for the given record variable
636    ///
637    /// ## Parameters
638    /// - `variable`: Variable metadata
639    ///
640    /// ## Returns
641    /// Data of the element
642    fn get_record(&self, variable: CDFVariable) -> Vec<CDFValue> {
643        // prep variables
644        let CDFRecordDimension { record_step, size: total_size, .. } = self.record_dimension;
645        let CDFVariable { size, r#type, .. } = variable;
646        let width = if size != 0 { size / netcdf_type_to_bytes(r#type) } else { 1 };
647
648        if record_step.is_none() {
649            panic!("record_dimension.record_step is undefined");
650        }
651        let record_step = record_step.unwrap();
652
653        // iterates over the data
654        let mut data = vec![];
655        let mut i = 0;
656        while i < total_size {
657            let current_offset = *self.cursor.borrow();
658            data.push(self.get_type(r#type, width));
659            *self.cursor.borrow_mut() = current_offset + record_step;
660            i += 1;
661        }
662
663        data
664    }
665
666    /// Apply padding as data is mapped to 4-byte alignment
667    fn padding(&self) {
668        let cursor = *self.cursor.borrow();
669        if !cursor.is_multiple_of(4) {
670            *self.cursor.borrow_mut() += 4 - (cursor % 4);
671        }
672    }
673}
674
675/// The NetCDF Iterator tool
676#[derive(Debug)]
677pub struct CDFIterator<'a, T: Reader> {
678    reader: &'a NetCDFReader<T>,
679    index: u64,
680    len: u64,
681}
682impl<T: Reader> Iterator for CDFIterator<'_, T> {
683    type Item = CDFVectorFeature;
684
685    fn next(&mut self) -> Option<Self::Item> {
686        let cdf_reader = &self.reader;
687        if self.index >= self.len {
688            None
689        } else if let Some(point) = cdf_reader.get_feature(self.index) {
690            self.index += 1;
691            Some(point)
692        } else {
693            None
694        }
695    }
696}
697/// A feature reader trait with a callback-based approach
698impl<T: Reader> FeatureReader<(), Properties, MValue> for NetCDFReader<T> {
699    type FeatureIterator<'a>
700        = CDFIterator<'a, T>
701    where
702        T: 'a;
703
704    fn iter(&self) -> Self::FeatureIterator<'_> {
705        CDFIterator { reader: self, index: 0, len: self.len() }
706    }
707
708    fn par_iter(&self, pool_size: usize, thread_id: usize) -> Self::FeatureIterator<'_> {
709        let pool_size = pool_size as u64;
710        let thread_id = thread_id as u64;
711        let start = self.len() * thread_id / pool_size;
712        let end = self.len() * (thread_id + 1) / pool_size;
713        CDFIterator { reader: self, index: start, len: end }
714    }
715}