Skip to main content

rust_hdf5/
dataset.rs

1//! Dataset creation and I/O.
2//!
3//! Datasets are created via the fluent [`DatasetBuilder`] API obtained from
4//! [`H5File::new_dataset`](crate::file::H5File::new_dataset). Once created,
5//! the [`H5Dataset`] handle can read or write raw typed data.
6
7use crate::attribute::AttrBuilder;
8use crate::error::{Hdf5Error, Result};
9use crate::file::{borrow_inner, borrow_inner_mut, clone_inner, H5FileInner, SharedInner};
10use crate::types::H5Type;
11
12// ---------------------------------------------------------------------------
13// DatasetBuilder
14// ---------------------------------------------------------------------------
15
/// A fluent builder for creating datasets.
///
/// Obtained from [`H5File::new_dataset::<T>()`](crate::file::H5File::new_dataset).
///
/// ```no_run
/// # use rust_hdf5::H5File;
/// let file = H5File::create("builder.h5").unwrap();
/// let ds = file.new_dataset::<f32>()
///     .shape(&[10, 20])
///     .create("temperatures")
///     .unwrap();
/// ```
pub struct DatasetBuilder<T: H5Type> {
    /// Shared handle to the owning file's I/O backend.
    file_inner: SharedInner,
    /// Dataset dimensions; must be set (empty vec = scalar) before `create()`.
    shape: Option<Vec<usize>>,
    /// Chunk dimensions; `Some` selects chunked storage.
    chunk_dims: Option<Vec<usize>>,
    /// Per-dimension maximums; `None` in a slot means unlimited.
    max_shape: Option<Vec<Option<usize>>>,
    /// Deflate (gzip) level, if plain deflate compression was requested.
    deflate_level: Option<u32>,
    /// Deflate level when the shuffle filter should run first.
    shuffle_deflate_level: Option<u32>,
    /// Explicit filter pipeline; takes precedence over the level fields above.
    custom_pipeline: Option<crate::format::messages::filter::FilterPipeline>,
    /// Group the dataset will be linked under (`None` or `"/"` = root group).
    group_path: Option<String>,
    /// Ties the builder to the element type without storing a `T`.
    _marker: std::marker::PhantomData<T>,
}
39
40impl<T: H5Type> DatasetBuilder<T> {
41    pub(crate) fn new(file_inner: SharedInner) -> Self {
42        Self {
43            file_inner,
44            shape: None,
45            chunk_dims: None,
46            max_shape: None,
47            deflate_level: None,
48            shuffle_deflate_level: None,
49            custom_pipeline: None,
50            group_path: None,
51            _marker: std::marker::PhantomData,
52        }
53    }
54
55    pub(crate) fn new_in_group(file_inner: SharedInner, group_path: String) -> Self {
56        Self {
57            file_inner,
58            shape: None,
59            chunk_dims: None,
60            max_shape: None,
61            deflate_level: None,
62            shuffle_deflate_level: None,
63            custom_pipeline: None,
64            group_path: Some(group_path),
65            _marker: std::marker::PhantomData,
66        }
67    }
68
69    /// Set the dataset dimensions.
70    ///
71    /// This is required before calling [`create`](Self::create).
72    /// Use an empty slice `&[]` for a scalar (0-dimensional) dataset.
73    #[must_use]
74    pub fn shape<S: AsRef<[usize]>>(mut self, dims: S) -> Self {
75        self.shape = Some(dims.as_ref().to_vec());
76        self
77    }
78
79    /// Create a scalar (0-dimensional) dataset holding a single value.
80    #[must_use]
81    pub fn scalar(mut self) -> Self {
82        self.shape = Some(vec![]);
83        self
84    }
85
86    /// Set chunk dimensions for chunked storage.
87    ///
88    /// When set, the dataset uses chunked storage with the extensible array
89    /// index. You should also call [`max_shape`](Self::max_shape) or
90    /// [`resizable`](Self::resizable) to allow extending.
91    #[must_use]
92    pub fn chunk(mut self, chunk_dims: &[usize]) -> Self {
93        self.chunk_dims = Some(chunk_dims.to_vec());
94        self
95    }
96
97    /// Make all dimensions unlimited (resizable).
98    ///
99    /// This sets max_dims to u64::MAX for all dimensions.
100    #[must_use]
101    pub fn resizable(mut self) -> Self {
102        self.max_shape = Some(vec![None; self.shape.as_ref().map_or(0, |s| s.len())]);
103        self
104    }
105
106    /// Set maximum dimensions. `None` means unlimited for that dimension.
107    #[must_use]
108    pub fn max_shape(mut self, max: &[Option<usize>]) -> Self {
109        self.max_shape = Some(max.to_vec());
110        self
111    }
112
113    /// Enable deflate (gzip) compression with the given level (0-9).
114    ///
115    /// Requires chunked storage (call `.chunk()` before `.create()`).
116    /// Level 0 = no compression, 9 = maximum compression. Default is 6.
117    #[must_use]
118    pub fn deflate(mut self, level: u32) -> Self {
119        self.deflate_level = Some(level);
120        self
121    }
122
123    /// Enable shuffle + deflate compression.
124    ///
125    /// Shuffle reorders bytes by position within elements before compression,
126    /// which typically improves compression ratios for numeric data.
127    /// Requires chunked storage.
128    #[must_use]
129    pub fn shuffle_deflate(mut self, level: u32) -> Self {
130        self.shuffle_deflate_level = Some(level);
131        self
132    }
133
134    /// Enable Zstandard compression with the given level (1-22, default 3).
135    ///
136    /// Requires chunked storage (call `.chunk()` before `.create()`).
137    #[must_use]
138    pub fn zstd(mut self, level: u32) -> Self {
139        self.custom_pipeline = Some(crate::format::messages::filter::FilterPipeline::zstd(level));
140        self
141    }
142
143    /// Set a custom filter pipeline for compression.
144    ///
145    /// This takes precedence over [`deflate`](Self::deflate) and
146    /// [`shuffle_deflate`](Self::shuffle_deflate). Requires chunked storage.
147    #[must_use]
148    pub fn filter_pipeline(
149        mut self,
150        pipeline: crate::format::messages::filter::FilterPipeline,
151    ) -> Self {
152        self.custom_pipeline = Some(pipeline);
153        self
154    }
155
156    /// Finalize and create the dataset with the given `name`.
157    ///
158    /// The name is the link name within the root group (e.g. `"data"` or
159    /// `"group1/data"` once nested groups are supported).
160    pub fn create(self, name: &str) -> Result<H5Dataset> {
161        let shape = self.shape.ok_or_else(|| {
162            Hdf5Error::InvalidState("shape must be set before calling create()".into())
163        })?;
164
165        // Build the full name: if created within a group, prefix with group path
166        let full_name = if let Some(ref gp) = self.group_path {
167            if gp == "/" {
168                name.to_string()
169            } else {
170                let trimmed = gp.trim_start_matches('/');
171                format!("{}/{}", trimmed, name)
172            }
173        } else {
174            name.to_string()
175        };
176        let group_path = self.group_path.clone();
177
178        let dims_u64: Vec<u64> = shape.iter().map(|&d| d as u64).collect();
179        let datatype = T::hdf5_type();
180        let element_size = T::element_size();
181
182        if let Some(ref chunk_dims) = self.chunk_dims {
183            // Chunked dataset
184            let chunk_u64: Vec<u64> = chunk_dims.iter().map(|&d| d as u64).collect();
185            let max_u64: Vec<u64> = if let Some(ref max) = self.max_shape {
186                max.iter()
187                    .map(|m| m.map_or(u64::MAX, |v| v as u64))
188                    .collect()
189            } else {
190                // Default: max = current
191                dims_u64.clone()
192            };
193
194            let index = {
195                let mut inner = borrow_inner_mut(&self.file_inner);
196                match &mut *inner {
197                    H5FileInner::Writer(writer) => {
198                        let idx = if let Some(pipeline) = self.custom_pipeline {
199                            writer.create_chunked_dataset_with_pipeline(
200                                &full_name, datatype, &dims_u64, &max_u64, &chunk_u64, pipeline,
201                            )?
202                        } else if let Some(level) = self.shuffle_deflate_level {
203                            let pipeline =
204                                crate::format::messages::filter::FilterPipeline::shuffle_deflate(
205                                    T::element_size() as u32,
206                                    level,
207                                );
208                            writer.create_chunked_dataset_with_pipeline(
209                                &full_name, datatype, &dims_u64, &max_u64, &chunk_u64, pipeline,
210                            )?
211                        } else if let Some(level) = self.deflate_level {
212                            writer.create_chunked_dataset_compressed(
213                                &full_name, datatype, &dims_u64, &max_u64, &chunk_u64, level,
214                            )?
215                        } else {
216                            writer.create_chunked_dataset(
217                                &full_name, datatype, &dims_u64, &max_u64, &chunk_u64,
218                            )?
219                        };
220                        if let Some(ref gp) = group_path {
221                            if gp != "/" {
222                                writer.assign_dataset_to_group(gp, idx)?;
223                            }
224                        }
225                        idx
226                    }
227                    H5FileInner::Reader(_) => {
228                        return Err(Hdf5Error::InvalidState(
229                            "cannot create a dataset in read mode".into(),
230                        ));
231                    }
232                    H5FileInner::Closed => {
233                        return Err(Hdf5Error::InvalidState("file is closed".into()));
234                    }
235                }
236            };
237
238            Ok(H5Dataset {
239                file_inner: clone_inner(&self.file_inner),
240                info: DatasetInfo::Writer {
241                    index,
242                    shape,
243                    element_size,
244                    chunked: true,
245                },
246            })
247        } else {
248            // Contiguous dataset (original path)
249            let index = {
250                let mut inner = borrow_inner_mut(&self.file_inner);
251                match &mut *inner {
252                    H5FileInner::Writer(writer) => {
253                        let idx = writer.create_dataset(&full_name, datatype, &dims_u64)?;
254                        if let Some(ref gp) = group_path {
255                            if gp != "/" {
256                                writer.assign_dataset_to_group(gp, idx)?;
257                            }
258                        }
259                        idx
260                    }
261                    H5FileInner::Reader(_) => {
262                        return Err(Hdf5Error::InvalidState(
263                            "cannot create a dataset in read mode".into(),
264                        ));
265                    }
266                    H5FileInner::Closed => {
267                        return Err(Hdf5Error::InvalidState("file is closed".into()));
268                    }
269                }
270            };
271
272            Ok(H5Dataset {
273                file_inner: clone_inner(&self.file_inner),
274                info: DatasetInfo::Writer {
275                    index,
276                    shape,
277                    element_size,
278                    chunked: false,
279                },
280            })
281        }
282    }
283}
284
285// ---------------------------------------------------------------------------
286// DatasetInfo
287// ---------------------------------------------------------------------------
288
/// Internal metadata about a dataset handle.
///
/// A handle is either writer-backed (identified by an index into the
/// writer's dataset list) or reader-backed (identified by its link name);
/// the read/write methods on [`H5Dataset`] dispatch on this.
enum DatasetInfo {
    /// A dataset created via `new_dataset().create()` in write mode.
    Writer {
        /// Index into the writer's dataset list.
        index: usize,
        /// Shape (current dimensions).
        shape: Vec<usize>,
        /// Size of one element in bytes.
        element_size: usize,
        /// Whether this is a chunked dataset.
        chunked: bool,
    },
    /// A dataset opened by name in read mode.
    Reader {
        /// The link name of the dataset.
        name: String,
        /// Shape (current dimensions).
        shape: Vec<usize>,
        /// Size of one element in bytes.
        element_size: usize,
    },
}
312
313// ---------------------------------------------------------------------------
314// H5Dataset
315// ---------------------------------------------------------------------------
316
/// A handle to an HDF5 dataset, supporting typed read and write operations.
///
/// The dataset holds a shared reference to the file's I/O backend, so it
/// remains valid even if the originating [`H5File`](crate::file::H5File) is
/// moved or dropped (they share ownership via `Rc`).
pub struct H5Dataset {
    /// Shared handle to the owning file's I/O backend.
    file_inner: SharedInner,
    /// Writer- or reader-mode metadata for this dataset.
    info: DatasetInfo,
}
326
327impl H5Dataset {
328    /// Create a reader-mode dataset handle (called internally by `H5File::dataset`).
329    pub(crate) fn new_reader(
330        file_inner: SharedInner,
331        name: String,
332        shape: Vec<usize>,
333        element_size: usize,
334    ) -> Self {
335        Self {
336            file_inner,
337            info: DatasetInfo::Reader {
338                name,
339                shape,
340                element_size,
341            },
342        }
343    }
344
345    /// Return the dataset dimensions.
346    pub fn shape(&self) -> Vec<usize> {
347        match &self.info {
348            DatasetInfo::Writer { shape, .. } => shape.clone(),
349            DatasetInfo::Reader { shape, .. } => shape.clone(),
350        }
351    }
352
353    /// Return the number of dimensions (rank) of the dataset.
354    pub fn ndims(&self) -> usize {
355        match &self.info {
356            DatasetInfo::Writer { shape, .. } => shape.len(),
357            DatasetInfo::Reader { shape, .. } => shape.len(),
358        }
359    }
360
361    /// Return the total number of elements in the dataset.
362    pub fn total_elements(&self) -> usize {
363        match &self.info {
364            DatasetInfo::Writer { shape, .. } => shape.iter().product(),
365            DatasetInfo::Reader { shape, .. } => shape.iter().product(),
366        }
367    }
368
369    /// Return the size of one element in bytes.
370    pub fn element_size(&self) -> usize {
371        match &self.info {
372            DatasetInfo::Writer { element_size, .. } => *element_size,
373            DatasetInfo::Reader { element_size, .. } => *element_size,
374        }
375    }
376
377    /// Return the chunk dimensions, if this is a chunked dataset.
378    pub fn chunk_dims(&self) -> Option<Vec<usize>> {
379        match &self.info {
380            DatasetInfo::Reader { name, .. } => {
381                let inner = borrow_inner(&self.file_inner);
382                if let H5FileInner::Reader(reader) = &*inner {
383                    if let Some(info) = reader.dataset_info(name) {
384                        if let crate::format::messages::data_layout::DataLayoutMessage::ChunkedV4 {
385                            chunk_dims,
386                            ..
387                        } = &info.layout
388                        {
389                            // Strip trailing element-size dimension
390                            return Some(
391                                chunk_dims[..chunk_dims.len() - 1]
392                                    .iter()
393                                    .map(|&d| d as usize)
394                                    .collect(),
395                            );
396                        }
397                    }
398                }
399                None
400            }
401            DatasetInfo::Writer { .. } => None,
402        }
403    }
404
405    /// Return whether this is a chunked dataset.
406    pub fn is_chunked(&self) -> bool {
407        match &self.info {
408            DatasetInfo::Writer { chunked, .. } => *chunked,
409            DatasetInfo::Reader { name, .. } => {
410                let inner = borrow_inner(&self.file_inner);
411                match &*inner {
412                    H5FileInner::Reader(reader) => {
413                        if let Some(info) = reader.dataset_info(name) {
414                            matches!(
415                                info.layout,
416                                crate::format::messages::data_layout::DataLayoutMessage::ChunkedV4 { .. }
417                            )
418                        } else {
419                            false
420                        }
421                    }
422                    _ => false,
423                }
424            }
425        }
426    }
427
428    /// Return the names of all attributes on this dataset (read mode only).
429    pub fn attr_names(&self) -> Result<Vec<String>> {
430        match &self.info {
431            DatasetInfo::Reader { name, .. } => {
432                let inner = borrow_inner(&self.file_inner);
433                match &*inner {
434                    H5FileInner::Reader(reader) => Ok(reader.dataset_attr_names(name)?),
435                    _ => Err(Hdf5Error::InvalidState("file is not in read mode".into())),
436                }
437            }
438            DatasetInfo::Writer { .. } => Err(Hdf5Error::InvalidState(
439                "attr_names not available in write mode".into(),
440            )),
441        }
442    }
443
444    /// Open an attribute by name (read mode only).
445    pub fn attr(&self, attr_name: &str) -> Result<crate::attribute::H5Attribute> {
446        match &self.info {
447            DatasetInfo::Reader { name, .. } => {
448                let inner = borrow_inner(&self.file_inner);
449                match &*inner {
450                    H5FileInner::Reader(reader) => {
451                        let attr_msg = reader.dataset_attr(name, attr_name)?;
452                        Ok(crate::attribute::H5Attribute::new_reader(
453                            clone_inner(&self.file_inner),
454                            attr_msg.name.clone(),
455                            attr_msg.data.clone(),
456                        ))
457                    }
458                    _ => Err(Hdf5Error::InvalidState("file is not in read mode".into())),
459                }
460            }
461            DatasetInfo::Writer { .. } => Err(Hdf5Error::InvalidState(
462                "attr() not available in write mode".into(),
463            )),
464        }
465    }
466
467    /// Start building a new attribute on this dataset.
468    ///
469    /// Returns a fluent builder. Call `.shape(())` for a scalar attribute
470    /// and `.create("name")` to finalize.
471    ///
472    /// # Example
473    ///
474    /// ```no_run
475    /// # use rust_hdf5::H5File;
476    /// # use rust_hdf5::types::VarLenUnicode;
477    /// let file = H5File::create("attr.h5").unwrap();
478    /// let ds = file.new_dataset::<f32>().shape(&[10]).create("data").unwrap();
479    /// let attr = ds.new_attr::<VarLenUnicode>().shape(()).create("units").unwrap();
480    /// attr.write_scalar(&VarLenUnicode("meters".to_string())).unwrap();
481    /// ```
482    pub fn new_attr<T: 'static>(&self) -> AttrBuilder<'_, T> {
483        let ds_index = match &self.info {
484            DatasetInfo::Writer { index, .. } => *index,
485            DatasetInfo::Reader { .. } => {
486                // Reader mode: we'll return a builder that will error on create.
487                // Using usize::MAX as sentinel.
488                usize::MAX
489            }
490        };
491        AttrBuilder::new(&self.file_inner, ds_index)
492    }
493
    /// Write a typed slice to the dataset (contiguous datasets only).
    ///
    /// The slice length must match the total number of elements declared by
    /// the dataset shape. The data is reinterpreted as raw bytes and written
    /// to the file.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The file is in read mode (or no longer in write mode).
    /// - The dataset is chunked (use [`write_chunk`](Self::write_chunk)).
    /// - The data length does not match the declared shape.
    /// - `T`'s element size differs from the dataset's element size.
    pub fn write_raw<T: H5Type>(&self, data: &[T]) -> Result<()> {
        match &self.info {
            DatasetInfo::Writer {
                index,
                shape,
                element_size,
                chunked,
            } => {
                // Chunked datasets must go through the chunk API.
                if *chunked {
                    return Err(Hdf5Error::InvalidState(
                        "use write_chunk for chunked datasets".into(),
                    ));
                }

                // The slice must account for every element of the shape.
                let total_elements: usize = shape.iter().product();
                if data.len() != total_elements {
                    return Err(Hdf5Error::InvalidState(format!(
                        "data length {} does not match dataset size {}",
                        data.len(),
                        total_elements,
                    )));
                }

                // Verify element size matches
                if T::element_size() != *element_size {
                    return Err(Hdf5Error::TypeMismatch(format!(
                        "write type has element size {} but dataset expects {}",
                        T::element_size(),
                        element_size,
                    )));
                }

                // Safety: T: Copy + 'static (numeric primitive) with well-defined
                // byte representation. The resulting slice borrows `data` and
                // lives only as long as this block.
                // NOTE(review): this also assumes T::element_size() ==
                // size_of::<T>() and that T has no padding bytes — confirm
                // against the H5Type impls.
                let byte_len = data.len() * T::element_size();
                let raw =
                    unsafe { std::slice::from_raw_parts(data.as_ptr() as *const u8, byte_len) };

                let mut inner = borrow_inner_mut(&self.file_inner);
                match &mut *inner {
                    H5FileInner::Writer(writer) => {
                        writer.write_dataset_raw(*index, raw)?;
                        Ok(())
                    }
                    _ => Err(Hdf5Error::InvalidState(
                        "file is no longer in write mode".into(),
                    )),
                }
            }
            DatasetInfo::Reader { .. } => Err(Hdf5Error::InvalidState(
                "cannot write to a dataset opened in read mode".into(),
            )),
        }
    }
560
561    /// Write a single chunk to a chunked dataset.
562    ///
563    /// `chunk_idx` is the linear chunk index (typically the frame number for
564    /// streaming datasets). `data` is the raw byte data for one chunk.
565    pub fn write_chunk(&self, chunk_idx: usize, data: &[u8]) -> Result<()> {
566        match &self.info {
567            DatasetInfo::Writer { index, chunked, .. } => {
568                if !*chunked {
569                    return Err(Hdf5Error::InvalidState(
570                        "write_chunk is only for chunked datasets".into(),
571                    ));
572                }
573
574                let mut inner = borrow_inner_mut(&self.file_inner);
575                match &mut *inner {
576                    H5FileInner::Writer(writer) => {
577                        writer.write_chunk(*index, chunk_idx as u64, data)?;
578                        Ok(())
579                    }
580                    _ => Err(Hdf5Error::InvalidState(
581                        "file is no longer in write mode".into(),
582                    )),
583                }
584            }
585            DatasetInfo::Reader { .. } => {
586                Err(Hdf5Error::InvalidState("cannot write in read mode".into()))
587            }
588        }
589    }
590
591    /// Write multiple chunks in a batch, optionally compressing in parallel.
592    ///
593    /// `chunks` is a slice of `(chunk_index, raw_data)` pairs. When a filter
594    /// pipeline is configured and the `parallel` feature is enabled, all
595    /// chunks are compressed concurrently via rayon.
596    pub fn write_chunks_batch(&self, chunks: &[(usize, &[u8])]) -> Result<()> {
597        match &self.info {
598            DatasetInfo::Writer { index, chunked, .. } => {
599                if !*chunked {
600                    return Err(Hdf5Error::InvalidState(
601                        "write_chunks_batch is only for chunked datasets".into(),
602                    ));
603                }
604                let pairs: Vec<(u64, &[u8])> = chunks
605                    .iter()
606                    .map(|(idx, data)| (*idx as u64, *data))
607                    .collect();
608                let mut inner = borrow_inner_mut(&self.file_inner);
609                match &mut *inner {
610                    H5FileInner::Writer(writer) => {
611                        writer.write_chunks_batch(*index, &pairs)?;
612                        Ok(())
613                    }
614                    _ => Err(Hdf5Error::InvalidState(
615                        "file is no longer in write mode".into(),
616                    )),
617                }
618            }
619            DatasetInfo::Reader { .. } => {
620                Err(Hdf5Error::InvalidState("cannot write in read mode".into()))
621            }
622        }
623    }
624
625    /// Extend the dimensions of a chunked dataset.
626    pub fn extend(&self, new_dims: &[usize]) -> Result<()> {
627        match &self.info {
628            DatasetInfo::Writer { index, chunked, .. } => {
629                if !*chunked {
630                    return Err(Hdf5Error::InvalidState(
631                        "extend is only for chunked datasets".into(),
632                    ));
633                }
634
635                let dims_u64: Vec<u64> = new_dims.iter().map(|&d| d as u64).collect();
636                let mut inner = borrow_inner_mut(&self.file_inner);
637                match &mut *inner {
638                    H5FileInner::Writer(writer) => {
639                        writer.extend_dataset(*index, &dims_u64)?;
640                        Ok(())
641                    }
642                    _ => Err(Hdf5Error::InvalidState(
643                        "file is no longer in write mode".into(),
644                    )),
645                }
646            }
647            DatasetInfo::Reader { .. } => {
648                Err(Hdf5Error::InvalidState("cannot extend in read mode".into()))
649            }
650        }
651    }
652
653    /// Flush a chunked dataset's index structures to disk.
654    pub fn flush(&self) -> Result<()> {
655        match &self.info {
656            DatasetInfo::Writer { index, .. } => {
657                let mut inner = borrow_inner_mut(&self.file_inner);
658                match &mut *inner {
659                    H5FileInner::Writer(writer) => {
660                        writer.flush_dataset(*index)?;
661                        Ok(())
662                    }
663                    _ => Ok(()),
664                }
665            }
666            DatasetInfo::Reader { .. } => Ok(()),
667        }
668    }
669
    /// Read a slice (hyperslab) of the dataset as a typed vector.
    ///
    /// `starts` and `counts` define the N-dimensional selection:
    /// `starts[d]` = first index along dim d, `counts[d]` = how many elements.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The dataset handle is in write mode, or the file left read mode.
    /// - `T`'s element size differs from the dataset's element size.
    /// - The raw byte count is not a multiple of `T::element_size()`.
    pub fn read_slice<T: H5Type>(&self, starts: &[usize], counts: &[usize]) -> Result<Vec<T>> {
        match &self.info {
            DatasetInfo::Reader {
                name, element_size, ..
            } => {
                if T::element_size() != *element_size {
                    return Err(Hdf5Error::TypeMismatch(format!(
                        "read type has element size {} but dataset has element size {}",
                        T::element_size(),
                        element_size,
                    )));
                }
                let starts_u64: Vec<u64> = starts.iter().map(|&s| s as u64).collect();
                let counts_u64: Vec<u64> = counts.iter().map(|&c| c as u64).collect();

                // Scope the RefCell borrow so it is released before the
                // byte-to-T conversion below.
                let raw = {
                    let mut inner = borrow_inner_mut(&self.file_inner);
                    match &mut *inner {
                        H5FileInner::Reader(reader) => {
                            reader.read_slice(name, &starts_u64, &counts_u64)?
                        }
                        _ => {
                            return Err(Hdf5Error::InvalidState("file is not in read mode".into()))
                        }
                    }
                };

                if raw.len() % T::element_size() != 0 {
                    return Err(Hdf5Error::TypeMismatch(format!(
                        "raw data size {} is not a multiple of element size {}",
                        raw.len(),
                        T::element_size(),
                    )));
                }

                // SAFETY: `count * T::element_size() == raw.len()` by the check
                // above, so the copy fills exactly the reserved capacity before
                // set_len. NOTE(review): this assumes T::element_size() ==
                // size_of::<T>() and that every byte pattern is a valid T —
                // confirm against the H5Type impls.
                let count = raw.len() / T::element_size();
                let mut result = Vec::<T>::with_capacity(count);
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        raw.as_ptr(),
                        result.as_mut_ptr() as *mut u8,
                        raw.len(),
                    );
                    result.set_len(count);
                }
                Ok(result)
            }
            DatasetInfo::Writer { .. } => Err(Hdf5Error::InvalidState(
                "cannot read_slice from a dataset in write mode".into(),
            )),
        }
    }
726
    /// Write a typed slice to a sub-region of a contiguous dataset.
    ///
    /// `starts` and `counts` define the N-dimensional selection.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The file is in read mode (or no longer in write mode).
    /// - The dataset is chunked.
    /// - `T`'s element size differs from the dataset's element size.
    /// - `data.len()` does not equal the product of `counts`.
    pub fn write_slice<T: H5Type>(
        &self,
        starts: &[usize],
        counts: &[usize],
        data: &[T],
    ) -> Result<()> {
        match &self.info {
            DatasetInfo::Writer {
                index,
                element_size,
                chunked,
                ..
            } => {
                if *chunked {
                    return Err(Hdf5Error::InvalidState(
                        "write_slice is only for contiguous datasets".into(),
                    ));
                }
                if T::element_size() != *element_size {
                    return Err(Hdf5Error::TypeMismatch(format!(
                        "write type has element size {} but dataset expects {}",
                        T::element_size(),
                        element_size,
                    )));
                }

                // The selection must account for exactly data.len() elements.
                let expected: usize = counts.iter().product();
                if data.len() != expected {
                    return Err(Hdf5Error::InvalidState(format!(
                        "data length {} does not match slice size {}",
                        data.len(),
                        expected,
                    )));
                }

                let starts_u64: Vec<u64> = starts.iter().map(|&s| s as u64).collect();
                let counts_u64: Vec<u64> = counts.iter().map(|&c| c as u64).collect();

                // SAFETY: reinterpret the typed slice as raw bytes for the
                // writer; the byte slice borrows `data` and does not outlive
                // this block. NOTE(review): assumes T::element_size() ==
                // size_of::<T>() and that T has no padding bytes — confirm
                // against the H5Type impls.
                let byte_len = data.len() * T::element_size();
                let raw =
                    unsafe { std::slice::from_raw_parts(data.as_ptr() as *const u8, byte_len) };

                let mut inner = borrow_inner_mut(&self.file_inner);
                match &mut *inner {
                    H5FileInner::Writer(writer) => {
                        writer.write_slice(*index, &starts_u64, &counts_u64, raw)?;
                        Ok(())
                    }
                    _ => Err(Hdf5Error::InvalidState(
                        "file is no longer in write mode".into(),
                    )),
                }
            }
            DatasetInfo::Reader { .. } => {
                Err(Hdf5Error::InvalidState("cannot write in read mode".into()))
            }
        }
    }
788
789    /// Read variable-length strings from a dataset.
790    ///
791    /// This handles h5py-style vlen string datasets that store strings
792    /// as global heap references. Returns one String per element.
793    pub fn read_vlen_strings(&self) -> Result<Vec<String>> {
794        match &self.info {
795            DatasetInfo::Reader { name, .. } => {
796                let mut inner = borrow_inner_mut(&self.file_inner);
797                match &mut *inner {
798                    H5FileInner::Reader(reader) => Ok(reader.read_vlen_strings(name)?),
799                    _ => Err(Hdf5Error::InvalidState("file is not in read mode".into())),
800                }
801            }
802            DatasetInfo::Writer { .. } => Err(Hdf5Error::InvalidState(
803                "cannot read vlen strings from a dataset in write mode".into(),
804            )),
805        }
806    }
807
808    /// Read the entire dataset as a typed vector.
809    ///
810    /// The raw bytes are read from the file and reinterpreted as `T`. The
811    /// caller must ensure that `T` matches the datatype used when the dataset
812    /// was written.
813    ///
814    /// # Errors
815    ///
816    /// Returns an error if:
817    /// - The file is in write mode.
818    /// - The raw data size is not a multiple of `T::element_size()`.
819    pub fn read_raw<T: H5Type>(&self) -> Result<Vec<T>> {
820        match &self.info {
821            DatasetInfo::Reader {
822                name, element_size, ..
823            } => {
824                if T::element_size() != *element_size {
825                    return Err(Hdf5Error::TypeMismatch(format!(
826                        "read type has element size {} but dataset has element size {}",
827                        T::element_size(),
828                        element_size,
829                    )));
830                }
831
832                let raw = {
833                    let mut inner = borrow_inner_mut(&self.file_inner);
834                    match &mut *inner {
835                        H5FileInner::Reader(reader) => reader.read_dataset_raw(name)?,
836                        _ => {
837                            return Err(Hdf5Error::InvalidState("file is not in read mode".into()));
838                        }
839                    }
840                };
841
842                if raw.len() % T::element_size() != 0 {
843                    return Err(Hdf5Error::TypeMismatch(format!(
844                        "raw data size {} is not a multiple of element size {}",
845                        raw.len(),
846                        T::element_size(),
847                    )));
848                }
849
850                let count = raw.len() / T::element_size();
851                let mut result = Vec::<T>::with_capacity(count);
852
853                // Safety: T is Copy + 'static (required by H5Type). We verified
854                // the byte count matches count * size_of::<T>() above.
855                // copy_nonoverlapping fills the memory with valid bit patterns
856                // for all H5Type implementors (numeric primitives).
857                // We call set_len AFTER the copy so that if an unexpected panic
858                // occurs, uninitialized memory is never exposed.
859                unsafe {
860                    std::ptr::copy_nonoverlapping(
861                        raw.as_ptr(),
862                        result.as_mut_ptr() as *mut u8,
863                        raw.len(),
864                    );
865                    result.set_len(count);
866                }
867
868                Ok(result)
869            }
870            DatasetInfo::Writer { .. } => Err(Hdf5Error::InvalidState(
871                "cannot read from a dataset in write mode".into(),
872            )),
873        }
874    }
875}
876
// Integration-style unit tests: each test creates a real HDF5 file in the OS
// temp directory, exercises the public write/read API end to end, and removes
// the file afterwards (best-effort via `.ok()`).
#[cfg(test)]
mod tests {
    use crate::H5File;
    use std::path::PathBuf;

    /// Builds a per-test temp-file path so tests can run in parallel without
    /// clobbering each other's files.
    fn temp_path(name: &str) -> PathBuf {
        std::env::temp_dir().join(format!("hdf5_dataset_test_{}.h5", name))
    }

    // Creating a dataset without calling `.shape(...)` must be rejected.
    #[test]
    fn builder_requires_shape() {
        let path = temp_path("no_shape");
        let file = H5File::create(&path).unwrap();
        let result = file.new_dataset::<u8>().create("data");
        assert!(result.is_err());
        std::fs::remove_file(&path).ok();
    }

    // write_raw must reject a buffer whose length disagrees with the shape.
    #[test]
    fn write_raw_size_mismatch() {
        let path = temp_path("size_mismatch");
        let file = H5File::create(&path).unwrap();
        let ds = file.new_dataset::<u8>().shape([4]).create("data").unwrap();
        // Provide 3 elements instead of 4
        let result = ds.write_raw(&[1u8, 2, 3]);
        assert!(result.is_err());
        std::fs::remove_file(&path).ok();
    }

    // Full write -> close -> reopen -> read cycle for a 1-D u8 dataset.
    #[test]
    fn roundtrip_u8_1d() {
        let path = temp_path("rt_u8_1d");
        let data: Vec<u8> = (0..10).collect();

        {
            let file = H5File::create(&path).unwrap();
            let ds = file.new_dataset::<u8>().shape([10]).create("seq").unwrap();
            ds.write_raw(&data).unwrap();
            file.close().unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("seq").unwrap();
            assert_eq!(ds.shape(), vec![10]);
            let readback = ds.read_raw::<u8>().unwrap();
            assert_eq!(readback, data);
        }

        std::fs::remove_file(&path).ok();
    }

    // Roundtrip for a 2-D i32 dataset, including negative values.
    #[test]
    fn roundtrip_i32_2d() {
        let path = temp_path("rt_i32_2d");
        let data: Vec<i32> = vec![-1, 0, 1, 2, 3, 4];

        {
            let file = H5File::create(&path).unwrap();
            let ds = file
                .new_dataset::<i32>()
                .shape([2, 3])
                .create("matrix")
                .unwrap();
            ds.write_raw(&data).unwrap();
            file.close().unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("matrix").unwrap();
            assert_eq!(ds.shape(), vec![2, 3]);
            let readback = ds.read_raw::<i32>().unwrap();
            assert_eq!(readback, data);
        }

        std::fs::remove_file(&path).ok();
    }

    // Roundtrip for a 3-D f64 dataset (2*3*4 = 24 elements).
    #[test]
    fn roundtrip_f64_3d() {
        let path = temp_path("rt_f64_3d");
        let data: Vec<f64> = (0..24).map(|i| i as f64 * 0.5).collect();

        {
            let file = H5File::create(&path).unwrap();
            let ds = file
                .new_dataset::<f64>()
                .shape([2, 3, 4])
                .create("cube")
                .unwrap();
            ds.write_raw(&data).unwrap();
            file.close().unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("cube").unwrap();
            assert_eq!(ds.shape(), vec![2, 3, 4]);
            let readback = ds.read_raw::<f64>().unwrap();
            assert_eq!(readback, data);
        }

        std::fs::remove_file(&path).ok();
    }

    // A dataset handle on a file still in write mode must refuse to read.
    #[test]
    fn cannot_read_in_write_mode() {
        let path = temp_path("no_read_write");
        let file = H5File::create(&path).unwrap();
        let ds = file.new_dataset::<u8>().shape([4]).create("x").unwrap();
        ds.write_raw(&[1u8, 2, 3, 4]).unwrap();
        let result = ds.read_raw::<u8>();
        assert!(result.is_err());
        std::fs::remove_file(&path).ok();
    }

    // A dataset handle on a reopened (read-mode) file must refuse to write.
    #[test]
    fn cannot_write_in_read_mode() {
        let path = temp_path("no_write_read");

        {
            let file = H5File::create(&path).unwrap();
            let ds = file.new_dataset::<u8>().shape([4]).create("x").unwrap();
            ds.write_raw(&[1u8, 2, 3, 4]).unwrap();
            file.close().unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("x").unwrap();
            let result = ds.write_raw(&[5u8, 6, 7, 8]);
            assert!(result.is_err());
        }

        std::fs::remove_file(&path).ok();
    }

    // Scalar numeric attributes (f64 and i32) survive a write/read cycle.
    #[test]
    fn numeric_attr_roundtrip() {
        let path = temp_path("num_attr");
        {
            let file = H5File::create(&path).unwrap();
            let ds = file.new_dataset::<f32>().shape([4]).create("data").unwrap();
            ds.write_raw(&[1.0f32; 4]).unwrap();

            // `.shape(())` creates a scalar (0-dimensional) attribute.
            let a1 = ds.new_attr::<f64>().shape(()).create("scale").unwrap();
            a1.write_numeric(&1.2345f64).unwrap();

            let a2 = ds.new_attr::<i32>().shape(()).create("count").unwrap();
            a2.write_numeric(&42i32).unwrap();

            file.close().unwrap();
        }
        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("data").unwrap();

            let scale = ds.attr("scale").unwrap();
            let val: f64 = scale.read_numeric().unwrap();
            assert!((val - 1.2345).abs() < 1e-10);

            let count = ds.attr("count").unwrap();
            let val: i32 = count.read_numeric().unwrap();
            assert_eq!(val, 42);
        }
        std::fs::remove_file(&path).ok();
    }

    // Dataset creation must fail when the file was opened read-only.
    #[test]
    fn cannot_create_dataset_in_read_mode() {
        let path = temp_path("no_create_read");

        {
            let _file = H5File::create(&path).unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();
            let result = file.new_dataset::<u8>().shape([4]).create("x");
            assert!(result.is_err());
        }

        std::fs::remove_file(&path).ok();
    }

    // shape() reflects the builder's dimensions even before any write.
    #[test]
    fn shape_accessor() {
        let path = temp_path("shape_acc");

        let file = H5File::create(&path).unwrap();
        let ds = file
            .new_dataset::<f32>()
            .shape([5, 10, 3])
            .create("tensor")
            .unwrap();
        assert_eq!(ds.shape(), vec![5, 10, 3]);

        std::fs::remove_file(&path).ok();
    }

    // Hyperslab read: a 2x2 sub-region of a row-major 4x5 matrix.
    #[test]
    fn slice_roundtrip_2d() {
        let path = temp_path("slice_2d");

        // Create a 4x5 dataset, write full, then read a slice
        let data: Vec<i32> = (0..20).collect();
        {
            let file = H5File::create(&path).unwrap();
            let ds = file
                .new_dataset::<i32>()
                .shape([4, 5])
                .create("mat")
                .unwrap();
            ds.write_raw(&data).unwrap();
            file.close().unwrap();
        }
        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("mat").unwrap();
            // Read rows 1..3, cols 2..4 (2x2 slice)
            let slice = ds.read_slice::<i32>(&[1, 2], &[2, 2]).unwrap();
            // Row 1: [5,6,7,8,9] -> cols 2..4 = [7,8]
            // Row 2: [10,11,12,13,14] -> cols 2..4 = [12,13]
            assert_eq!(slice, vec![7, 8, 12, 13]);
        }

        std::fs::remove_file(&path).ok();
    }

    // Hyperslab write: overwrite a 2x2 sub-region and read back the full grid.
    #[test]
    fn write_slice_2d() {
        let path = temp_path("write_slice_2d");

        {
            let file = H5File::create(&path).unwrap();
            let ds = file
                .new_dataset::<f32>()
                .shape([3, 4])
                .create("data")
                .unwrap();
            ds.write_raw(&[0.0f32; 12]).unwrap();
            // Overwrite a 2x2 sub-region
            ds.write_slice(&[1, 1], &[2, 2], &[10.0f32, 20.0, 30.0, 40.0])
                .unwrap();
            file.close().unwrap();
        }
        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("data").unwrap();
            let full = ds.read_raw::<f32>().unwrap();
            // Row 0: [0,0,0,0]
            // Row 1: [0,10,20,0]
            // Row 2: [0,30,40,0]
            assert_eq!(
                full,
                vec![0.0, 0.0, 0.0, 0.0, 0.0, 10.0, 20.0, 0.0, 0.0, 30.0, 40.0, 0.0,]
            );
        }

        std::fs::remove_file(&path).ok();
    }

    // String attributes: write two, then enumerate and read both back.
    #[test]
    fn attr_read_roundtrip() {
        use crate::types::VarLenUnicode;
        let path = temp_path("attr_read");

        {
            let file = H5File::create(&path).unwrap();
            let ds = file.new_dataset::<u8>().shape([4]).create("data").unwrap();
            ds.write_raw(&[1u8, 2, 3, 4]).unwrap();
            let a1 = ds
                .new_attr::<VarLenUnicode>()
                .shape(())
                .create("units")
                .unwrap();
            a1.write_string("meters").unwrap();
            let a2 = ds
                .new_attr::<VarLenUnicode>()
                .shape(())
                .create("desc")
                .unwrap();
            a2.write_string("test data").unwrap();
            file.close().unwrap();
        }
        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("data").unwrap();

            let names = ds.attr_names().unwrap();
            assert!(names.contains(&"units".to_string()));
            assert!(names.contains(&"desc".to_string()));

            let units = ds.attr("units").unwrap();
            assert_eq!(units.read_string().unwrap(), "meters");

            let desc = ds.attr("desc").unwrap();
            assert_eq!(desc.read_string().unwrap(), "test data");
        }

        std::fs::remove_file(&path).ok();
    }

    // Element-size check: reading an f64 dataset as u8 must fail cleanly.
    #[test]
    fn type_mismatch_element_size() {
        let path = temp_path("type_mismatch");

        {
            let file = H5File::create(&path).unwrap();
            let ds = file.new_dataset::<f64>().shape([4]).create("data").unwrap();
            ds.write_raw(&[1.0f64, 2.0, 3.0, 4.0]).unwrap();
            file.close().unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("data").unwrap();
            // Try to read as u8 (element_size = 1) from a f64 dataset (element_size = 8)
            let result = ds.read_raw::<u8>();
            assert!(result.is_err());
        }

        std::fs::remove_file(&path).ok();
    }

    // The dataset handle keeps the file's inner state alive via shared
    // ownership, so it remains usable after the H5File value is dropped.
    #[test]
    fn dataset_survives_file_move() {
        let path = temp_path("ds_survives");

        let ds = {
            let file = H5File::create(&path).unwrap();
            file.new_dataset::<u8>().shape([4]).create("x").unwrap()
        };
        // file is dropped here, but ds still holds Rc to the inner state
        ds.write_raw(&[1u8, 2, 3, 4]).unwrap();
        // The writer will finalize on drop of the last Rc

        std::fs::remove_file(&path).ok();
    }

    // Writing a scalar vlen-string attribute must not corrupt the file.
    #[test]
    fn new_attr_scalar_string() {
        use crate::types::VarLenUnicode;

        let path = temp_path("attr_scalar_string");
        {
            let file = H5File::create(&path).unwrap();
            let ds = file.new_dataset::<u8>().shape([4]).create("data").unwrap();
            ds.write_raw(&[1u8, 2, 3, 4]).unwrap();

            let attr = ds
                .new_attr::<VarLenUnicode>()
                .shape(())
                .create("name")
                .unwrap();
            attr.write_scalar(&VarLenUnicode("test_value".to_string()))
                .unwrap();

            file.close().unwrap();
        }

        // Verify the file is still valid and readable
        {
            let file = H5File::open(&path).unwrap();
            let ds = file.dataset("data").unwrap();
            assert_eq!(ds.shape(), vec![4]);
            let readback = ds.read_raw::<u8>().unwrap();
            assert_eq!(readback, vec![1u8, 2, 3, 4]);
        }

        std::fs::remove_file(&path).ok();
    }

    // Roundtrip every supported numeric element type in a single file.
    #[test]
    fn all_numeric_types_roundtrip() {
        let path = temp_path("all_types");

        {
            let file = H5File::create(&path).unwrap();

            let ds = file.new_dataset::<u8>().shape([2]).create("u8").unwrap();
            ds.write_raw(&[1u8, 2]).unwrap();

            let ds = file.new_dataset::<i8>().shape([2]).create("i8").unwrap();
            ds.write_raw(&[-1i8, 1]).unwrap();

            let ds = file.new_dataset::<u16>().shape([2]).create("u16").unwrap();
            ds.write_raw(&[100u16, 200]).unwrap();

            let ds = file.new_dataset::<i16>().shape([2]).create("i16").unwrap();
            ds.write_raw(&[-100i16, 100]).unwrap();

            let ds = file.new_dataset::<u32>().shape([2]).create("u32").unwrap();
            ds.write_raw(&[1000u32, 2000]).unwrap();

            let ds = file.new_dataset::<i32>().shape([2]).create("i32").unwrap();
            ds.write_raw(&[-1000i32, 1000]).unwrap();

            let ds = file.new_dataset::<u64>().shape([2]).create("u64").unwrap();
            ds.write_raw(&[10000u64, 20000]).unwrap();

            let ds = file.new_dataset::<i64>().shape([2]).create("i64").unwrap();
            ds.write_raw(&[-10000i64, 10000]).unwrap();

            let ds = file.new_dataset::<f32>().shape([2]).create("f32").unwrap();
            ds.write_raw(&[1.5f32, 2.5]).unwrap();

            let ds = file.new_dataset::<f64>().shape([2]).create("f64").unwrap();
            ds.write_raw(&[1.23456f64, 7.89012]).unwrap();

            file.close().unwrap();
        }

        {
            let file = H5File::open(&path).unwrap();

            assert_eq!(
                file.dataset("u8").unwrap().read_raw::<u8>().unwrap(),
                vec![1u8, 2]
            );
            assert_eq!(
                file.dataset("i8").unwrap().read_raw::<i8>().unwrap(),
                vec![-1i8, 1]
            );
            assert_eq!(
                file.dataset("u16").unwrap().read_raw::<u16>().unwrap(),
                vec![100u16, 200]
            );
            assert_eq!(
                file.dataset("i16").unwrap().read_raw::<i16>().unwrap(),
                vec![-100i16, 100]
            );
            assert_eq!(
                file.dataset("u32").unwrap().read_raw::<u32>().unwrap(),
                vec![1000u32, 2000]
            );
            assert_eq!(
                file.dataset("i32").unwrap().read_raw::<i32>().unwrap(),
                vec![-1000i32, 1000]
            );
            assert_eq!(
                file.dataset("u64").unwrap().read_raw::<u64>().unwrap(),
                vec![10000u64, 20000]
            );
            assert_eq!(
                file.dataset("i64").unwrap().read_raw::<i64>().unwrap(),
                vec![-10000i64, 10000]
            );
            assert_eq!(
                file.dataset("f32").unwrap().read_raw::<f32>().unwrap(),
                vec![1.5f32, 2.5]
            );
            assert_eq!(
                file.dataset("f64").unwrap().read_raw::<f64>().unwrap(),
                vec![1.23456f64, 7.89012]
            );
        }

        std::fs::remove_file(&path).ok();
    }
}