// oxigdal_hdf5/dataset.rs

1//! HDF5 dataset handling for multi-dimensional array storage.
2//!
3//! Datasets are multi-dimensional arrays with a fixed datatype and shape.
4//! They can be chunked, compressed, and have associated metadata (attributes).
5
6use crate::attribute::Attributes;
7use crate::datatype::Datatype;
8use crate::error::{Hdf5Error, Result};
9use serde::{Deserialize, Serialize};
10
/// Dataset layout type
///
/// Describes how a dataset's raw bytes are arranged in storage.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum LayoutType {
    /// Contiguous layout (all data in a single block)
    Contiguous,
    /// Chunked layout (data divided into fixed-size chunks);
    /// chunk extents live in [`DatasetProperties::chunk_dims`]
    Chunked,
    /// Compact layout (data stored in the object header)
    Compact,
}

/// Compression filter type
///
/// Carries the filter selection for a dataset; only GZIP has a parameter.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CompressionFilter {
    /// No compression
    None,
    /// GZIP/DEFLATE compression
    Gzip {
        /// Compression level (1-9). `DatasetProperties::with_gzip` clamps to
        /// this range; direct construction of this variant is not validated.
        level: u8,
    },
    /// LZF compression
    Lzf,
    /// SZIP compression
    Szip,
}

/// Dataset creation properties
///
/// Built via [`DatasetProperties::new`] and the chainable `with_*` methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetProperties {
    /// Layout type
    layout: LayoutType,
    /// Chunk dimensions (only for chunked layout)
    chunk_dims: Option<Vec<usize>>,
    /// Compression filter
    compression: CompressionFilter,
    /// Fill value as raw bytes
    // NOTE(review): presumably interpreted according to the dataset's
    // datatype — not validated against it here; confirm at write time.
    fill_value: Option<Vec<u8>>,
}

51impl DatasetProperties {
52    /// Create default properties (contiguous layout, no compression)
53    pub fn new() -> Self {
54        Self {
55            layout: LayoutType::Contiguous,
56            chunk_dims: None,
57            compression: CompressionFilter::None,
58            fill_value: None,
59        }
60    }
61
62    /// Set layout type
63    pub fn with_layout(mut self, layout: LayoutType) -> Self {
64        self.layout = layout;
65        self
66    }
67
68    /// Set chunking (automatically sets layout to Chunked)
69    pub fn with_chunks(mut self, chunk_dims: Vec<usize>) -> Self {
70        self.layout = LayoutType::Chunked;
71        self.chunk_dims = Some(chunk_dims);
72        self
73    }
74
75    /// Set compression
76    pub fn with_compression(mut self, compression: CompressionFilter) -> Self {
77        self.compression = compression;
78        self
79    }
80
81    /// Set GZIP compression
82    pub fn with_gzip(mut self, level: u8) -> Self {
83        let level = level.clamp(1, 9);
84        self.compression = CompressionFilter::Gzip { level };
85        self
86    }
87
88    /// Set fill value
89    pub fn with_fill_value(mut self, fill_value: Vec<u8>) -> Self {
90        self.fill_value = Some(fill_value);
91        self
92    }
93
94    /// Get layout type
95    pub fn layout(&self) -> LayoutType {
96        self.layout
97    }
98
99    /// Get chunk dimensions
100    pub fn chunk_dims(&self) -> Option<&[usize]> {
101        self.chunk_dims.as_deref()
102    }
103
104    /// Get compression filter
105    pub fn compression(&self) -> CompressionFilter {
106        self.compression
107    }
108
109    /// Get fill value
110    pub fn fill_value(&self) -> Option<&[u8]> {
111        self.fill_value.as_deref()
112    }
113
114    /// Validate chunk dimensions against dataset dimensions
115    pub fn validate_chunks(&self, dims: &[usize]) -> Result<()> {
116        if let Some(chunks) = &self.chunk_dims {
117            if chunks.len() != dims.len() {
118                return Err(Hdf5Error::InvalidChunkSize(format!(
119                    "Chunk dimensions ({}) must match dataset dimensions ({})",
120                    chunks.len(),
121                    dims.len()
122                )));
123            }
124
125            for (i, (&chunk_size, &dim_size)) in chunks.iter().zip(dims.iter()).enumerate() {
126                if chunk_size == 0 {
127                    return Err(Hdf5Error::InvalidChunkSize(format!(
128                        "Chunk size at dimension {} cannot be zero",
129                        i
130                    )));
131                }
132                if chunk_size > dim_size {
133                    return Err(Hdf5Error::InvalidChunkSize(format!(
134                        "Chunk size ({}) at dimension {} exceeds dataset size ({})",
135                        chunk_size, i, dim_size
136                    )));
137                }
138            }
139        }
140        Ok(())
141    }
142}
143
144impl Default for DatasetProperties {
145    fn default() -> Self {
146        Self::new()
147    }
148}
149
/// HDF5 dataset
///
/// A named multi-dimensional array with a fixed [`Datatype`] and shape,
/// plus creation properties (layout/chunking/compression) and attributes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
    /// Dataset name
    name: String,
    /// Full path from root
    path: String,
    /// Datatype
    datatype: Datatype,
    /// Dimensions (shape)
    dims: Vec<usize>,
    /// Dataset properties
    properties: DatasetProperties,
    /// Attributes
    attributes: Attributes,
    /// Raw data (for in-memory datasets)
    // Skipped by serde: not serialized, and always `None` after deserialization.
    #[serde(skip)]
    data: Option<Vec<u8>>,
}

170impl Dataset {
171    /// Create a new dataset
172    pub fn new(
173        name: String,
174        path: String,
175        datatype: Datatype,
176        dims: Vec<usize>,
177        properties: DatasetProperties,
178    ) -> Result<Self> {
179        // Validate dimensions
180        if dims.is_empty() {
181            return Err(Hdf5Error::invalid_dimensions(
182                "Dataset must have at least one dimension",
183            ));
184        }
185
186        for (i, &dim) in dims.iter().enumerate() {
187            if dim == 0 {
188                return Err(Hdf5Error::invalid_dimensions(format!(
189                    "Dimension {} cannot be zero",
190                    i
191                )));
192            }
193        }
194
195        // Validate chunk dimensions
196        properties.validate_chunks(&dims)?;
197
198        Ok(Self {
199            name,
200            path,
201            datatype,
202            dims,
203            properties,
204            attributes: Attributes::new(),
205            data: None,
206        })
207    }
208
209    /// Create a dataset with default properties
210    pub fn simple(
211        name: String,
212        path: String,
213        datatype: Datatype,
214        dims: Vec<usize>,
215    ) -> Result<Self> {
216        Self::new(name, path, datatype, dims, DatasetProperties::new())
217    }
218
219    /// Get the dataset name
220    pub fn name(&self) -> &str {
221        &self.name
222    }
223
224    /// Get the full path
225    pub fn path(&self) -> &str {
226        &self.path
227    }
228
229    /// Get the datatype
230    pub fn datatype(&self) -> &Datatype {
231        &self.datatype
232    }
233
234    /// Get the dimensions (shape)
235    pub fn dims(&self) -> &[usize] {
236        &self.dims
237    }
238
239    /// Get the number of dimensions
240    pub fn ndims(&self) -> usize {
241        self.dims.len()
242    }
243
244    /// Get the total number of elements
245    pub fn len(&self) -> usize {
246        self.dims.iter().product()
247    }
248
249    /// Check if empty
250    pub fn is_empty(&self) -> bool {
251        self.len() == 0
252    }
253
254    /// Get the total size in bytes
255    pub fn size_in_bytes(&self) -> usize {
256        self.len() * self.datatype.size()
257    }
258
259    /// Get the dataset properties
260    pub fn properties(&self) -> &DatasetProperties {
261        &self.properties
262    }
263
264    /// Get the attributes
265    pub fn attributes(&self) -> &Attributes {
266        &self.attributes
267    }
268
269    /// Get mutable attributes
270    pub fn attributes_mut(&mut self) -> &mut Attributes {
271        &mut self.attributes
272    }
273
274    /// Set the raw data
275    pub fn set_data(&mut self, data: Vec<u8>) -> Result<()> {
276        let expected_size = self.size_in_bytes();
277        if data.len() != expected_size {
278            return Err(Hdf5Error::InvalidSize(format!(
279                "Data size ({}) does not match expected size ({})",
280                data.len(),
281                expected_size
282            )));
283        }
284        self.data = Some(data);
285        Ok(())
286    }
287
288    /// Get the raw data
289    pub fn data(&self) -> Option<&[u8]> {
290        self.data.as_deref()
291    }
292
293    /// Take the raw data
294    pub fn take_data(&mut self) -> Option<Vec<u8>> {
295        self.data.take()
296    }
297
298    /// Validate slice parameters
299    pub fn validate_slice(&self, start: &[usize], count: &[usize]) -> Result<()> {
300        if start.len() != self.ndims() {
301            return Err(Hdf5Error::invalid_dimensions(format!(
302                "Start dimensions ({}) must match dataset dimensions ({})",
303                start.len(),
304                self.ndims()
305            )));
306        }
307
308        if count.len() != self.ndims() {
309            return Err(Hdf5Error::invalid_dimensions(format!(
310                "Count dimensions ({}) must match dataset dimensions ({})",
311                count.len(),
312                self.ndims()
313            )));
314        }
315
316        for (i, (&s, &c)) in start.iter().zip(count.iter()).enumerate() {
317            if s + c > self.dims[i] {
318                return Err(Hdf5Error::OutOfBounds {
319                    index: s + c,
320                    size: self.dims[i],
321                });
322            }
323        }
324
325        Ok(())
326    }
327
328    /// Calculate the number of elements in a slice
329    pub fn slice_size(&self, count: &[usize]) -> usize {
330        count.iter().product()
331    }
332
333    /// Calculate the size in bytes of a slice
334    pub fn slice_size_bytes(&self, count: &[usize]) -> usize {
335        self.slice_size(count) * self.datatype.size()
336    }
337}
338
339/// Helper functions for creating datasets with common configurations
340impl Dataset {
341    /// Create a 1D dataset
342    pub fn from_1d(name: String, path: String, datatype: Datatype, size: usize) -> Result<Self> {
343        Self::simple(name, path, datatype, vec![size])
344    }
345
346    /// Create a 2D dataset
347    pub fn from_2d(
348        name: String,
349        path: String,
350        datatype: Datatype,
351        rows: usize,
352        cols: usize,
353    ) -> Result<Self> {
354        Self::simple(name, path, datatype, vec![rows, cols])
355    }
356
357    /// Create a 3D dataset
358    pub fn from_3d(
359        name: String,
360        path: String,
361        datatype: Datatype,
362        depth: usize,
363        rows: usize,
364        cols: usize,
365    ) -> Result<Self> {
366        Self::simple(name, path, datatype, vec![depth, rows, cols])
367    }
368
369    /// Create a chunked dataset
370    pub fn chunked(
371        name: String,
372        path: String,
373        datatype: Datatype,
374        dims: Vec<usize>,
375        chunk_dims: Vec<usize>,
376    ) -> Result<Self> {
377        let properties = DatasetProperties::new().with_chunks(chunk_dims);
378        Self::new(name, path, datatype, dims, properties)
379    }
380
381    /// Create a compressed dataset
382    pub fn compressed(
383        name: String,
384        path: String,
385        datatype: Datatype,
386        dims: Vec<usize>,
387        chunk_dims: Vec<usize>,
388        compression: CompressionFilter,
389    ) -> Result<Self> {
390        let properties = DatasetProperties::new()
391            .with_chunks(chunk_dims)
392            .with_compression(compression);
393        Self::new(name, path, datatype, dims, properties)
394    }
395}
396
397#[cfg(test)]
398mod tests {
399    use super::*;
400
401    #[test]
402    fn test_dataset_properties() {
403        let props = DatasetProperties::new();
404        assert_eq!(props.layout(), LayoutType::Contiguous);
405        assert!(props.chunk_dims().is_none());
406        assert_eq!(props.compression(), CompressionFilter::None);
407
408        let props = DatasetProperties::new()
409            .with_chunks(vec![10, 10])
410            .with_gzip(6);
411        assert_eq!(props.layout(), LayoutType::Chunked);
412        assert_eq!(props.chunk_dims(), Some(&[10, 10][..]));
413        assert_eq!(props.compression(), CompressionFilter::Gzip { level: 6 });
414    }
415
416    #[test]
417    fn test_dataset_creation() {
418        let dataset = Dataset::simple(
419            "data".to_string(),
420            "/data".to_string(),
421            Datatype::Float32,
422            vec![100, 200],
423        )
424        .expect("Failed to create dataset");
425
426        assert_eq!(dataset.name(), "data");
427        assert_eq!(dataset.path(), "/data");
428        assert_eq!(dataset.datatype(), &Datatype::Float32);
429        assert_eq!(dataset.dims(), &[100, 200]);
430        assert_eq!(dataset.ndims(), 2);
431        assert_eq!(dataset.len(), 20000);
432        assert_eq!(dataset.size_in_bytes(), 80000); // 20000 * 4 bytes
433    }
434
435    #[test]
436    fn test_dataset_1d() {
437        let dataset = Dataset::from_1d(
438            "data".to_string(),
439            "/data".to_string(),
440            Datatype::Int32,
441            100,
442        )
443        .expect("Failed to create dataset");
444
445        assert_eq!(dataset.dims(), &[100]);
446        assert_eq!(dataset.len(), 100);
447    }
448
449    #[test]
450    fn test_dataset_2d() {
451        let dataset = Dataset::from_2d(
452            "data".to_string(),
453            "/data".to_string(),
454            Datatype::Float64,
455            50,
456            100,
457        )
458        .expect("Failed to create dataset");
459
460        assert_eq!(dataset.dims(), &[50, 100]);
461        assert_eq!(dataset.len(), 5000);
462    }
463
464    #[test]
465    fn test_dataset_3d() {
466        let dataset = Dataset::from_3d(
467            "data".to_string(),
468            "/data".to_string(),
469            Datatype::UInt8,
470            10,
471            20,
472            30,
473        )
474        .expect("Failed to create dataset");
475
476        assert_eq!(dataset.dims(), &[10, 20, 30]);
477        assert_eq!(dataset.len(), 6000);
478    }
479
480    #[test]
481    fn test_dataset_chunked() {
482        let dataset = Dataset::chunked(
483            "data".to_string(),
484            "/data".to_string(),
485            Datatype::Float32,
486            vec![100, 200],
487            vec![10, 20],
488        )
489        .expect("Failed to create dataset");
490
491        assert_eq!(dataset.properties().layout(), LayoutType::Chunked);
492        assert_eq!(dataset.properties().chunk_dims(), Some(&[10, 20][..]));
493    }
494
495    #[test]
496    fn test_dataset_compressed() {
497        let dataset = Dataset::compressed(
498            "data".to_string(),
499            "/data".to_string(),
500            Datatype::Float64,
501            vec![100, 200],
502            vec![10, 20],
503            CompressionFilter::Gzip { level: 6 },
504        )
505        .expect("Failed to create dataset");
506
507        assert_eq!(dataset.properties().layout(), LayoutType::Chunked);
508        assert_eq!(
509            dataset.properties().compression(),
510            CompressionFilter::Gzip { level: 6 }
511        );
512    }
513
514    #[test]
515    fn test_dataset_validate_slice() {
516        let dataset = Dataset::from_2d(
517            "data".to_string(),
518            "/data".to_string(),
519            Datatype::Int32,
520            100,
521            200,
522        )
523        .expect("Failed to create dataset");
524
525        assert!(dataset.validate_slice(&[0, 0], &[50, 100]).is_ok());
526        assert!(dataset.validate_slice(&[50, 100], &[50, 100]).is_ok());
527        assert!(dataset.validate_slice(&[0, 0], &[100, 200]).is_ok());
528        assert!(dataset.validate_slice(&[0, 0], &[101, 200]).is_err());
529        assert!(dataset.validate_slice(&[50, 100], &[51, 100]).is_err());
530    }
531
532    #[test]
533    fn test_dataset_slice_size() {
534        let dataset = Dataset::from_2d(
535            "data".to_string(),
536            "/data".to_string(),
537            Datatype::Int32,
538            100,
539            200,
540        )
541        .expect("Failed to create dataset");
542
543        assert_eq!(dataset.slice_size(&[50, 100]), 5000);
544        assert_eq!(dataset.slice_size_bytes(&[50, 100]), 20000); // 5000 * 4 bytes
545    }
546
547    #[test]
548    fn test_dataset_set_data() {
549        let mut dataset =
550            Dataset::from_1d("data".to_string(), "/data".to_string(), Datatype::Int32, 10)
551                .expect("Failed to create dataset");
552
553        let data = vec![0u8; 40]; // 10 * 4 bytes
554        assert!(dataset.set_data(data).is_ok());
555
556        let wrong_size_data = vec![0u8; 50];
557        assert!(dataset.set_data(wrong_size_data).is_err());
558    }
559}