1#[cfg(feature = "formats")]
8use crate::error::{DatasetsError, Result};
9#[cfg(feature = "formats")]
10use crate::utils::Dataset;
11#[cfg(feature = "formats")]
12use scirs2_core::ndarray::{Array1, Array2};
13#[cfg(feature = "formats")]
14use std::path::Path;
15
/// On-disk dataset formats this module can detect and (eventually) convert.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatType {
    /// Apache Parquet columnar files (`.parquet`, `.pq`).
    Parquet,
    /// Apache Arrow files (`.arrow`).
    Arrow,
    /// HDF5 hierarchical files (`.h5`, `.hdf5`).
    Hdf5,
    /// Comma-separated values (`.csv`).
    Csv,
}
28
29impl FormatType {
30 pub fn from_extension(path: &str) -> Option<Self> {
32 let lower = path.to_lowercase();
33 if lower.ends_with(".parquet") || lower.ends_with(".pq") {
34 Some(FormatType::Parquet)
35 } else if lower.ends_with(".arrow") {
36 Some(FormatType::Arrow)
37 } else if lower.ends_with(".h5") || lower.ends_with(".hdf5") {
38 Some(FormatType::Hdf5)
39 } else if lower.ends_with(".csv") {
40 Some(FormatType::Csv)
41 } else {
42 None
43 }
44 }
45
46 pub fn extension(&self) -> &'static str {
48 match self {
49 FormatType::Parquet => "parquet",
50 FormatType::Arrow => "arrow",
51 FormatType::Hdf5 => "h5",
52 FormatType::Csv => "csv",
53 }
54 }
55}
56
/// Tuning options shared by the format readers, writers, and converter.
#[derive(Debug, Clone)]
pub struct FormatConfig {
    /// Number of rows handled per chunk when streaming.
    pub chunk_size: usize,
    /// Compression codec applied on write; `None` disables compression.
    pub compression: Option<CompressionCodec>,
    /// Whether to memory-map files on read when possible.
    pub use_mmap: bool,
    /// I/O buffer size in bytes.
    pub buffer_size: usize,
}
69
70impl Default for FormatConfig {
71 fn default() -> Self {
72 Self {
73 chunk_size: 10_000,
74 compression: Some(CompressionCodec::Snappy),
75 use_mmap: true,
76 buffer_size: 8 * 1024 * 1024, }
78 }
79}
80
/// Compression codecs selectable for dataset output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionCodec {
    /// No compression.
    None,
    /// Snappy: fast codec with no tunable level.
    Snappy,
    /// Gzip (deflate); tunable level, default 6 (see `level`).
    Gzip,
    /// LZ4: fast codec with no tunable level.
    Lz4,
    /// Zstandard; tunable level, default 3 (see `level`).
    Zstd,
}
95
96impl CompressionCodec {
97 pub fn level(&self) -> Option<i32> {
99 match self {
100 CompressionCodec::None | CompressionCodec::Snappy | CompressionCodec::Lz4 => None,
101 CompressionCodec::Gzip => Some(6), CompressionCodec::Zstd => Some(3), }
104 }
105}
106
/// Reader for Apache Parquet files.
///
/// NOTE(review): currently a stub — `read` always errors until the
/// scirs2-io parquet backend lands.
#[cfg(feature = "formats")]
pub struct ParquetReader {
    /// Read configuration (chunking, compression, mmap, buffering).
    config: FormatConfig,
}
116
#[cfg(feature = "formats")]
impl ParquetReader {
    /// Create a reader with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a reader using an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read a dataset from a Parquet file.
    ///
    /// # Errors
    /// Currently always returns [`DatasetsError::InvalidFormat`]; the
    /// backing implementation is still in development.
    pub fn read<P: AsRef<Path>>(&self, _path: P) -> Result<Dataset> {
        let msg = "Parquet reading requires scirs2-io parquet feature (in development)";
        Err(DatasetsError::InvalidFormat(msg.to_string()))
    }
}
143
#[cfg(feature = "formats")]
impl Default for ParquetReader {
    /// Equivalent to [`ParquetReader::new`].
    fn default() -> Self {
        ParquetReader::new()
    }
}
150
/// Writer for Apache Parquet files.
///
/// NOTE(review): currently a stub — `write` always errors until the
/// scirs2-io parquet backend lands.
#[cfg(feature = "formats")]
pub struct ParquetWriter {
    /// Write configuration (chunking, compression, mmap, buffering).
    config: FormatConfig,
}
156
#[cfg(feature = "formats")]
impl ParquetWriter {
    /// Create a writer with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a writer using an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Write a dataset to a Parquet file.
    ///
    /// # Errors
    /// Currently always returns [`DatasetsError::InvalidFormat`]; the
    /// backing implementation is still in development.
    pub fn write<P: AsRef<Path>>(&self, _dataset: &Dataset, _path: P) -> Result<()> {
        let msg = "Parquet writing requires scirs2-io parquet feature (in development)";
        Err(DatasetsError::InvalidFormat(msg.to_string()))
    }
}
179
#[cfg(feature = "formats")]
impl Default for ParquetWriter {
    /// Equivalent to [`ParquetWriter::new`].
    fn default() -> Self {
        ParquetWriter::new()
    }
}
186
/// Reader for HDF5 files.
///
/// NOTE(review): currently a stub — `read` always errors until the
/// scirs2-io hdf5 backend lands.
#[cfg(feature = "formats")]
pub struct Hdf5Reader {
    /// Read configuration (chunking, compression, mmap, buffering).
    config: FormatConfig,
}
196
#[cfg(feature = "formats")]
impl Hdf5Reader {
    /// Create a reader with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a reader using an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read the named dataset from an HDF5 file.
    ///
    /// # Errors
    /// Currently always returns [`DatasetsError::InvalidFormat`]; the
    /// backing implementation is still in development.
    pub fn read<P: AsRef<Path>>(&self, _path: P, _dataset_name: &str) -> Result<Dataset> {
        let msg = "HDF5 reading requires scirs2-io hdf5 feature (in development)";
        Err(DatasetsError::InvalidFormat(msg.to_string()))
    }
}
219
#[cfg(feature = "formats")]
impl Default for Hdf5Reader {
    /// Equivalent to [`Hdf5Reader::new`].
    fn default() -> Self {
        Hdf5Reader::new()
    }
}
226
/// Writer for HDF5 files.
///
/// NOTE(review): currently a stub — `write` always errors until the
/// scirs2-io hdf5 backend lands.
#[cfg(feature = "formats")]
pub struct Hdf5Writer {
    /// Write configuration (chunking, compression, mmap, buffering).
    config: FormatConfig,
}
232
#[cfg(feature = "formats")]
impl Hdf5Writer {
    /// Create a writer with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::with_config(FormatConfig::default())
    }

    /// Create a writer using an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Write a dataset under the given name in an HDF5 file.
    ///
    /// # Errors
    /// Currently always returns [`DatasetsError::InvalidFormat`]; the
    /// backing implementation is still in development.
    pub fn write<P: AsRef<Path>>(
        &self,
        _dataset: &Dataset,
        _path: P,
        _dataset_name: &str,
    ) -> Result<()> {
        let msg = "HDF5 writing requires scirs2-io hdf5 feature (in development)";
        Err(DatasetsError::InvalidFormat(msg.to_string()))
    }
}
260
#[cfg(feature = "formats")]
impl Default for Hdf5Writer {
    /// Equivalent to [`Hdf5Writer::new`].
    fn default() -> Self {
        Hdf5Writer::new()
    }
}
267
/// Converts dataset files between the supported on-disk formats
/// and auto-detects formats from file extensions.
#[cfg(feature = "formats")]
pub struct FormatConverter {
    /// Conversion configuration (chunking, compression, mmap, buffering).
    config: FormatConfig,
}
277
#[cfg(feature = "formats")]
impl FormatConverter {
    /// Create a converter with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self {
            config: FormatConfig::default(),
        }
    }

    /// Create a converter using an explicit configuration.
    ///
    /// Added for consistency with the reader/writer types, which all expose
    /// `with_config`; previously the stored `config` could only be the default.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Convert a dataset file from `input_format` to `output_format`.
    ///
    /// # Errors
    /// Returns [`DatasetsError::InvalidFormat`] for formats that are not yet
    /// supported (CSV, Arrow), and propagates any read or write failure.
    pub fn convert<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        input_path: P1,
        input_format: FormatType,
        output_path: P2,
        output_format: FormatType,
    ) -> Result<()> {
        // Load the source dataset in its declared input format.
        let dataset = match input_format {
            FormatType::Parquet => ParquetReader::new().read(input_path)?,
            FormatType::Hdf5 => Hdf5Reader::new().read(input_path, "data")?,
            FormatType::Csv => {
                return Err(DatasetsError::InvalidFormat(
                    "CSV reading via format converter not yet implemented".to_string(),
                ))
            }
            FormatType::Arrow => {
                return Err(DatasetsError::InvalidFormat(
                    "Arrow format not yet supported".to_string(),
                ))
            }
        };

        // Write it back out in the requested target format.
        match output_format {
            FormatType::Parquet => ParquetWriter::new().write(&dataset, output_path)?,
            FormatType::Hdf5 => Hdf5Writer::new().write(&dataset, output_path, "data")?,
            FormatType::Csv => {
                return Err(DatasetsError::InvalidFormat(
                    "CSV writing via format converter not yet implemented".to_string(),
                ))
            }
            FormatType::Arrow => {
                return Err(DatasetsError::InvalidFormat(
                    "Arrow format not yet supported".to_string(),
                ))
            }
        }

        Ok(())
    }

    /// Read a dataset, auto-detecting its format from the file extension.
    ///
    /// # Errors
    /// Returns [`DatasetsError::InvalidFormat`] when the path is not valid
    /// UTF-8, the extension is unrecognized, or the detected format has no
    /// reader yet.
    pub fn read_auto<P: AsRef<Path>>(&self, path: P) -> Result<Dataset> {
        let path_str = path
            .as_ref()
            .to_str()
            .ok_or_else(|| DatasetsError::InvalidFormat("Invalid path".to_string()))?;

        let format = FormatType::from_extension(path_str)
            .ok_or_else(|| DatasetsError::InvalidFormat("Could not detect format".to_string()))?;

        match format {
            FormatType::Parquet => ParquetReader::new().read(path),
            FormatType::Hdf5 => Hdf5Reader::new().read(path, "data"),
            _ => Err(DatasetsError::InvalidFormat(format!(
                "Unsupported format: {:?}",
                format
            ))),
        }
    }
}
350
#[cfg(feature = "formats")]
impl Default for FormatConverter {
    /// Equivalent to [`FormatConverter::new`].
    fn default() -> Self {
        FormatConverter::new()
    }
}
357
/// Convenience wrapper: read a dataset from a Parquet file with default settings.
#[cfg(feature = "formats")]
pub fn read_parquet<P: AsRef<Path>>(path: P) -> Result<Dataset> {
    let reader = ParquetReader::new();
    reader.read(path)
}
367
/// Convenience wrapper: write a dataset to a Parquet file with default settings.
#[cfg(feature = "formats")]
pub fn write_parquet<P: AsRef<Path>>(dataset: &Dataset, path: P) -> Result<()> {
    let writer = ParquetWriter::new();
    writer.write(dataset, path)
}
373
/// Convenience wrapper: read the named dataset from an HDF5 file with default settings.
#[cfg(feature = "formats")]
pub fn read_hdf5<P: AsRef<Path>>(path: P, dataset_name: &str) -> Result<Dataset> {
    let reader = Hdf5Reader::new();
    reader.read(path, dataset_name)
}
379
/// Convenience wrapper: write a dataset under the given name in an HDF5 file
/// with default settings.
#[cfg(feature = "formats")]
pub fn write_hdf5<P: AsRef<Path>>(dataset: &Dataset, path: P, dataset_name: &str) -> Result<()> {
    let writer = Hdf5Writer::new();
    writer.write(dataset, path, dataset_name)
}
385
/// Convenience wrapper: read a dataset, detecting the format from the extension.
#[cfg(feature = "formats")]
pub fn read_auto<P: AsRef<Path>>(path: P) -> Result<Dataset> {
    let converter = FormatConverter::new();
    converter.read_auto(path)
}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// Every documented extension maps to its format, case-insensitively;
    /// unknown extensions yield `None`.
    #[test]
    fn test_format_detection() {
        assert_eq!(
            FormatType::from_extension("data.parquet"),
            Some(FormatType::Parquet)
        );
        // Short parquet alias and uppercase variant (detection lowercases).
        assert_eq!(
            FormatType::from_extension("data.pq"),
            Some(FormatType::Parquet)
        );
        assert_eq!(
            FormatType::from_extension("DATA.PARQUET"),
            Some(FormatType::Parquet)
        );
        assert_eq!(
            FormatType::from_extension("data.arrow"),
            Some(FormatType::Arrow)
        );
        assert_eq!(
            FormatType::from_extension("data.h5"),
            Some(FormatType::Hdf5)
        );
        assert_eq!(
            FormatType::from_extension("data.hdf5"),
            Some(FormatType::Hdf5)
        );
        assert_eq!(
            FormatType::from_extension("data.csv"),
            Some(FormatType::Csv)
        );
        assert_eq!(FormatType::from_extension("data.txt"), None);
    }

    /// Canonical extensions for all four formats.
    #[test]
    fn test_format_extension() {
        assert_eq!(FormatType::Parquet.extension(), "parquet");
        assert_eq!(FormatType::Arrow.extension(), "arrow");
        assert_eq!(FormatType::Hdf5.extension(), "h5");
        assert_eq!(FormatType::Csv.extension(), "csv");
    }

    /// Levelless codecs return `None`; gzip and zstd expose their defaults.
    #[test]
    fn test_compression_codec() {
        assert_eq!(CompressionCodec::None.level(), None);
        assert_eq!(CompressionCodec::Snappy.level(), None);
        assert_eq!(CompressionCodec::Lz4.level(), None);
        assert_eq!(CompressionCodec::Gzip.level(), Some(6));
        assert_eq!(CompressionCodec::Zstd.level(), Some(3));
    }

    /// Defaults documented on `FormatConfig::default` hold for every field.
    #[test]
    fn test_format_config() {
        let config = FormatConfig::default();
        assert_eq!(config.chunk_size, 10_000);
        assert_eq!(config.compression, Some(CompressionCodec::Snappy));
        assert!(config.use_mmap);
        assert_eq!(config.buffer_size, 8 * 1024 * 1024);
    }
}