// scirs2_datasets/formats.rs
1//! Support for various data formats (Parquet, Arrow, HDF5)
2//!
3//! This module provides integration with scirs2-io for reading and writing
4//! datasets in modern columnar formats like Parquet and Arrow, as well as
5//! scientific formats like HDF5, with memory-efficient streaming support.
6
7#[cfg(feature = "formats")]
8use crate::error::{DatasetsError, Result};
9#[cfg(feature = "formats")]
10use crate::utils::Dataset;
11#[cfg(feature = "formats")]
12use scirs2_core::ndarray::{Array1, Array2};
13#[cfg(feature = "formats")]
14use std::path::Path;
15
/// Format type enumeration for the supported on-disk/in-memory data formats.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatType {
    /// Apache Parquet columnar format
    Parquet,
    /// Apache Arrow in-memory format
    Arrow,
    /// HDF5 hierarchical format
    Hdf5,
    /// CSV format (for completeness)
    Csv,
}

impl FormatType {
    /// Detect the format from a file path's extension (case-insensitive).
    ///
    /// Returns `None` when no known extension matches.
    pub fn from_extension(path: &str) -> Option<Self> {
        // Suffix lookup table; first matching suffix wins.
        const SUFFIXES: [(&str, FormatType); 6] = [
            (".parquet", FormatType::Parquet),
            (".pq", FormatType::Parquet),
            (".arrow", FormatType::Arrow),
            (".h5", FormatType::Hdf5),
            (".hdf5", FormatType::Hdf5),
            (".csv", FormatType::Csv),
        ];
        let normalized = path.to_lowercase();
        SUFFIXES
            .iter()
            .find(|(suffix, _)| normalized.ends_with(suffix))
            .map(|&(_, format)| format)
    }

    /// Canonical file extension (without the leading dot) for this format.
    pub fn extension(&self) -> &'static str {
        match self {
            Self::Parquet => "parquet",
            Self::Arrow => "arrow",
            Self::Hdf5 => "h5",
            Self::Csv => "csv",
        }
    }
}
56
/// Configuration for format conversion
#[derive(Debug, Clone)]
pub struct FormatConfig {
    /// Chunk size for streaming operations
    pub chunk_size: usize,
    /// Compression codec
    pub compression: Option<CompressionCodec>,
    /// Whether to use memory mapping when possible
    pub use_mmap: bool,
    /// Buffer size for I/O operations
    pub buffer_size: usize,
}

impl Default for FormatConfig {
    /// Defaults: 10k-row chunks, Snappy compression, mmap enabled, 8 MB buffers.
    fn default() -> Self {
        FormatConfig {
            chunk_size: 10_000,
            compression: Some(CompressionCodec::Snappy),
            use_mmap: true,
            buffer_size: 8 * 1024 * 1024, // 8 MB
        }
    }
}

/// Compression codec options
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionCodec {
    /// No compression
    None,
    /// Snappy compression
    Snappy,
    /// GZIP compression
    Gzip,
    /// LZ4 compression
    Lz4,
    /// ZSTD compression
    Zstd,
}

impl CompressionCodec {
    /// Default compression level for codecs that take one; `None` for codecs
    /// without a tunable level (no-op, Snappy, LZ4).
    pub fn level(&self) -> Option<i32> {
        match self {
            Self::Gzip => Some(6), // Default GZIP level
            Self::Zstd => Some(3), // Default ZSTD level
            Self::None | Self::Snappy | Self::Lz4 => None,
        }
    }
}
106
107// ============================================================================
108// Parquet Support (when formats feature is enabled)
109// ============================================================================
110
#[cfg(feature = "formats")]
/// Parquet reader for datasets
pub struct ParquetReader {
    // I/O configuration (chunking, compression, buffering); currently unused
    // by the stub implementation below.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl ParquetReader {
    /// Create a reader with the default configuration.
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a reader using the supplied configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read a Parquet file into a [`Dataset`].
    ///
    /// # Errors
    ///
    /// Currently always returns `DatasetsError::InvalidFormat`: this is a
    /// stub awaiting integration with the scirs2-io Parquet implementation.
    pub fn read<P: AsRef<Path>>(&self, _path: P) -> Result<Dataset> {
        // TODO: wire up actual Parquet reading through scirs2-io.
        Err(DatasetsError::InvalidFormat(
            "Parquet reading requires scirs2-io parquet feature (in development)".to_string(),
        ))
    }
}

#[cfg(feature = "formats")]
impl Default for ParquetReader {
    fn default() -> Self {
        Self::new()
    }
}
150
#[cfg(feature = "formats")]
/// Parquet writer for datasets
pub struct ParquetWriter {
    // I/O configuration (chunking, compression, buffering); currently unused
    // by the stub implementation below.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl ParquetWriter {
    /// Create a writer with the default configuration.
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a writer using the supplied configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Write a [`Dataset`] to a Parquet file.
    ///
    /// # Errors
    ///
    /// Currently always returns `DatasetsError::InvalidFormat`: this is a
    /// stub awaiting integration with the scirs2-io Parquet implementation.
    pub fn write<P: AsRef<Path>>(&self, _dataset: &Dataset, _path: P) -> Result<()> {
        // TODO: wire up actual Parquet writing through scirs2-io.
        Err(DatasetsError::InvalidFormat(
            "Parquet writing requires scirs2-io parquet feature (in development)".to_string(),
        ))
    }
}

#[cfg(feature = "formats")]
impl Default for ParquetWriter {
    fn default() -> Self {
        Self::new()
    }
}
186
187// ============================================================================
188// HDF5 Support
189// ============================================================================
190
#[cfg(feature = "formats")]
/// HDF5 reader for datasets
pub struct Hdf5Reader {
    // I/O configuration (chunking, compression, buffering); currently unused
    // by the stub implementation below.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl Hdf5Reader {
    /// Create a reader with the default configuration.
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a reader using the supplied configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read the named dataset from an HDF5 file into a [`Dataset`].
    ///
    /// # Errors
    ///
    /// Currently always returns `DatasetsError::InvalidFormat`: this is a
    /// stub awaiting integration with the scirs2-io HDF5 implementation.
    pub fn read<P: AsRef<Path>>(&self, _path: P, _dataset_name: &str) -> Result<Dataset> {
        // TODO: wire up actual HDF5 reading through scirs2-io.
        Err(DatasetsError::InvalidFormat(
            "HDF5 reading requires scirs2-io hdf5 feature (in development)".to_string(),
        ))
    }
}

#[cfg(feature = "formats")]
impl Default for Hdf5Reader {
    fn default() -> Self {
        Self::new()
    }
}
226
#[cfg(feature = "formats")]
/// HDF5 writer for datasets
pub struct Hdf5Writer {
    // I/O configuration (chunking, compression, buffering); currently unused
    // by the stub implementation below.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl Hdf5Writer {
    /// Create a writer with the default configuration.
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a writer using the supplied configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Write a [`Dataset`] to the named dataset inside an HDF5 file.
    ///
    /// # Errors
    ///
    /// Currently always returns `DatasetsError::InvalidFormat`: this is a
    /// stub awaiting integration with the scirs2-io HDF5 implementation.
    pub fn write<P: AsRef<Path>>(
        &self,
        _dataset: &Dataset,
        _path: P,
        _dataset_name: &str,
    ) -> Result<()> {
        // TODO: wire up actual HDF5 writing through scirs2-io.
        Err(DatasetsError::InvalidFormat(
            "HDF5 writing requires scirs2-io hdf5 feature (in development)".to_string(),
        ))
    }
}

#[cfg(feature = "formats")]
impl Default for Hdf5Writer {
    fn default() -> Self {
        Self::new()
    }
}
267
268// ============================================================================
269// Format Conversion
270// ============================================================================
271
#[cfg(feature = "formats")]
/// Convert between different data formats
pub struct FormatConverter {
    /// I/O configuration forwarded to the underlying readers and writers.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl FormatConverter {
    /// Create a new format converter with the default configuration.
    pub fn new() -> Self {
        Self {
            config: FormatConfig::default(),
        }
    }

    /// Create a format converter with a custom configuration.
    ///
    /// Mirrors the `with_config` constructors on the individual
    /// readers/writers for API consistency.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Convert a dataset from one format to another.
    ///
    /// The converter's configuration is forwarded to the reader and writer.
    /// (Previously `self.config` was stored but silently ignored because the
    /// readers/writers were built with `::new()`.)
    ///
    /// # Errors
    ///
    /// Returns `DatasetsError::InvalidFormat` for CSV/Arrow endpoints, and
    /// propagates any error from the underlying reader or writer.
    pub fn convert<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        input_path: P1,
        input_format: FormatType,
        output_path: P2,
        output_format: FormatType,
    ) -> Result<()> {
        // Read in input format, using this converter's configuration.
        let dataset = match input_format {
            FormatType::Parquet => {
                ParquetReader::with_config(self.config.clone()).read(input_path)?
            }
            FormatType::Hdf5 => {
                Hdf5Reader::with_config(self.config.clone()).read(input_path, "data")?
            }
            FormatType::Csv => {
                return Err(DatasetsError::InvalidFormat(
                    "CSV reading via format converter not yet implemented".to_string(),
                ))
            }
            FormatType::Arrow => {
                return Err(DatasetsError::InvalidFormat(
                    "Arrow format not yet supported".to_string(),
                ))
            }
        };

        // Write in output format, again using this converter's configuration.
        match output_format {
            FormatType::Parquet => {
                ParquetWriter::with_config(self.config.clone()).write(&dataset, output_path)?
            }
            FormatType::Hdf5 => {
                Hdf5Writer::with_config(self.config.clone()).write(&dataset, output_path, "data")?
            }
            FormatType::Csv => {
                return Err(DatasetsError::InvalidFormat(
                    "CSV writing via format converter not yet implemented".to_string(),
                ))
            }
            FormatType::Arrow => {
                return Err(DatasetsError::InvalidFormat(
                    "Arrow format not yet supported".to_string(),
                ))
            }
        }

        Ok(())
    }

    /// Auto-detect format from the file extension and read.
    ///
    /// # Errors
    ///
    /// Returns `DatasetsError::InvalidFormat` when the path is not valid
    /// UTF-8, the extension is unrecognized, or the format is unsupported;
    /// otherwise propagates the reader's error.
    pub fn read_auto<P: AsRef<Path>>(&self, path: P) -> Result<Dataset> {
        let path_str = path
            .as_ref()
            .to_str()
            .ok_or_else(|| DatasetsError::InvalidFormat("Invalid path".to_string()))?;

        let format = FormatType::from_extension(path_str)
            .ok_or_else(|| DatasetsError::InvalidFormat("Could not detect format".to_string()))?;

        match format {
            FormatType::Parquet => ParquetReader::with_config(self.config.clone()).read(path),
            FormatType::Hdf5 => Hdf5Reader::with_config(self.config.clone()).read(path, "data"),
            _ => Err(DatasetsError::InvalidFormat(format!(
                "Unsupported format: {:?}",
                format
            ))),
        }
    }
}

#[cfg(feature = "formats")]
impl Default for FormatConverter {
    fn default() -> Self {
        Self::new()
    }
}
357
358// ============================================================================
359// Convenience Functions
360// ============================================================================
361
/// Read a Parquet file into a [`Dataset`] using default settings.
///
/// # Errors
///
/// Propagates the error from [`ParquetReader::read`] (currently always a
/// stub `InvalidFormat` error).
#[cfg(feature = "formats")]
pub fn read_parquet<P: AsRef<Path>>(path: P) -> Result<Dataset> {
    let reader = ParquetReader::new();
    reader.read(path)
}
367
/// Write a [`Dataset`] to a Parquet file using default settings.
///
/// # Errors
///
/// Propagates the error from [`ParquetWriter::write`] (currently always a
/// stub `InvalidFormat` error).
#[cfg(feature = "formats")]
pub fn write_parquet<P: AsRef<Path>>(dataset: &Dataset, path: P) -> Result<()> {
    let writer = ParquetWriter::new();
    writer.write(dataset, path)
}
373
/// Read the named dataset from an HDF5 file using default settings.
///
/// # Errors
///
/// Propagates the error from [`Hdf5Reader::read`] (currently always a stub
/// `InvalidFormat` error).
#[cfg(feature = "formats")]
pub fn read_hdf5<P: AsRef<Path>>(path: P, dataset_name: &str) -> Result<Dataset> {
    let reader = Hdf5Reader::new();
    reader.read(path, dataset_name)
}
379
/// Write a [`Dataset`] to the named dataset in an HDF5 file using default settings.
///
/// # Errors
///
/// Propagates the error from [`Hdf5Writer::write`] (currently always a stub
/// `InvalidFormat` error).
#[cfg(feature = "formats")]
pub fn write_hdf5<P: AsRef<Path>>(dataset: &Dataset, path: P, dataset_name: &str) -> Result<()> {
    let writer = Hdf5Writer::new();
    writer.write(dataset, path, dataset_name)
}
385
/// Auto-detect the format from the file extension and read the dataset.
///
/// # Errors
///
/// Propagates the error from [`FormatConverter::read_auto`] (invalid path,
/// unknown extension, unsupported format, or reader failure).
#[cfg(feature = "formats")]
pub fn read_auto<P: AsRef<Path>>(path: P) -> Result<Dataset> {
    let converter = FormatConverter::new();
    converter.read_auto(path)
}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension detection covers known formats and rejects unknown ones.
    #[test]
    fn test_format_detection() {
        let cases = [
            ("data.parquet", Some(FormatType::Parquet)),
            ("data.h5", Some(FormatType::Hdf5)),
            ("data.csv", Some(FormatType::Csv)),
            ("data.txt", None),
        ];
        for (path, expected) in cases {
            assert_eq!(FormatType::from_extension(path), expected);
        }
    }

    /// Each format reports its canonical extension.
    #[test]
    fn test_format_extension() {
        let cases = [
            (FormatType::Parquet, "parquet"),
            (FormatType::Hdf5, "h5"),
            (FormatType::Csv, "csv"),
        ];
        for (format, ext) in cases {
            assert_eq!(format.extension(), ext);
        }
    }

    /// Only leveled codecs report a default level.
    #[test]
    fn test_compression_codec() {
        assert_eq!(CompressionCodec::Gzip.level(), Some(6));
        assert_eq!(CompressionCodec::Zstd.level(), Some(3));
        assert_eq!(CompressionCodec::None.level(), None);
        assert_eq!(CompressionCodec::Snappy.level(), None);
    }

    /// Default configuration matches the documented values.
    #[test]
    fn test_format_config() {
        let config = FormatConfig::default();
        assert_eq!(config.chunk_size, 10_000);
        assert_eq!(config.compression, Some(CompressionCodec::Snappy));
        assert!(config.use_mmap);
    }
}
435}