#[cfg(feature = "formats")]
use crate::error::{DatasetsError, Result};
#[cfg(feature = "formats")]
use crate::utils::Dataset;
#[cfg(feature = "formats")]
use scirs2_core::ndarray::{Array1, Array2};
#[cfg(feature = "formats")]
use std::path::Path;
/// Supported on-disk dataset formats, identified by file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatType {
    /// Apache Parquet columnar format (`.parquet`, `.pq`).
    Parquet,
    /// Apache Arrow IPC format (`.arrow`).
    Arrow,
    /// HDF5 hierarchical format (`.h5`, `.hdf5`).
    Hdf5,
    /// Comma-separated values (`.csv`).
    Csv,
}

impl FormatType {
    /// Detect a format from a path's file extension, case-insensitively.
    ///
    /// Returns `None` when the path has no `.` or the extension is not
    /// one of the recognized aliases.
    pub fn from_extension(path: &str) -> Option<Self> {
        let lower = path.to_lowercase();
        // Everything after the last dot is the extension; no dot → no format.
        let (_, ext) = lower.rsplit_once('.')?;
        match ext {
            "parquet" | "pq" => Some(FormatType::Parquet),
            "arrow" => Some(FormatType::Arrow),
            "h5" | "hdf5" => Some(FormatType::Hdf5),
            "csv" => Some(FormatType::Csv),
            _ => None,
        }
    }

    /// Canonical file extension (without the leading dot) for this format.
    pub fn extension(&self) -> &'static str {
        match self {
            FormatType::Parquet => "parquet",
            FormatType::Arrow => "arrow",
            FormatType::Hdf5 => "h5",
            FormatType::Csv => "csv",
        }
    }
}
/// Tuning options shared by the format readers, writers, and converter.
#[derive(Debug, Clone)]
pub struct FormatConfig {
    /// Number of rows processed per batch.
    pub chunk_size: usize,
    /// Compression codec applied when writing; `None` disables compression.
    pub compression: Option<CompressionCodec>,
    /// Whether to memory-map files when reading.
    pub use_mmap: bool,
    /// I/O buffer size in bytes.
    pub buffer_size: usize,
}

impl Default for FormatConfig {
    /// Throughput-oriented defaults: 10k-row chunks, Snappy compression,
    /// memory-mapped reads, 8 MiB buffers.
    fn default() -> Self {
        FormatConfig {
            chunk_size: 10_000,
            compression: Some(CompressionCodec::Snappy),
            use_mmap: true,
            buffer_size: 8 << 20, // 8 MiB
        }
    }
}

/// Compression codecs supported by the columnar formats.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionCodec {
    None,
    Snappy,
    Gzip,
    Lz4,
    Zstd,
}

impl CompressionCodec {
    /// Default compression level for codecs that accept one.
    ///
    /// Snappy and LZ4 are level-free (and `None` is no compression at all),
    /// so those variants return `None`.
    pub fn level(&self) -> Option<i32> {
        match self {
            CompressionCodec::Gzip => Some(6),
            CompressionCodec::Zstd => Some(3),
            CompressionCodec::None | CompressionCodec::Snappy | CompressionCodec::Lz4 => None,
        }
    }
}
/// Reader for Parquet dataset files.
///
/// Currently a stub: `read` always errors until the `scirs2-io` parquet
/// backend is available.
#[cfg(feature = "formats")]
#[derive(Default)]
pub struct ParquetReader {
    // Read-side tuning options; unused by the stub implementation.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl ParquetReader {
    /// Create a reader with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a reader with an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read a dataset from `_path`.
    ///
    /// # Errors
    /// Always returns `DatasetsError::InvalidFormat` until the backend
    /// is implemented.
    pub fn read<P: AsRef<Path>>(&self, _path: P) -> Result<Dataset> {
        Err(DatasetsError::InvalidFormat(
            "Parquet reading requires scirs2-io parquet feature (in development)".to_string(),
        ))
    }
}
/// Writer for Parquet dataset files.
///
/// Currently a stub: `write` always errors until the `scirs2-io` parquet
/// backend is available.
#[cfg(feature = "formats")]
#[derive(Default)]
pub struct ParquetWriter {
    // Write-side tuning options; unused by the stub implementation.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl ParquetWriter {
    /// Create a writer with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a writer with an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Write `_dataset` to `_path`.
    ///
    /// # Errors
    /// Always returns `DatasetsError::InvalidFormat` until the backend
    /// is implemented.
    pub fn write<P: AsRef<Path>>(&self, _dataset: &Dataset, _path: P) -> Result<()> {
        Err(DatasetsError::InvalidFormat(
            "Parquet writing requires scirs2-io parquet feature (in development)".to_string(),
        ))
    }
}
/// Reader for HDF5 dataset files.
///
/// Currently a stub: `read` always errors until the `scirs2-io` hdf5
/// backend is available.
#[cfg(feature = "formats")]
#[derive(Default)]
pub struct Hdf5Reader {
    // Read-side tuning options; unused by the stub implementation.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl Hdf5Reader {
    /// Create a reader with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a reader with an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read the named HDF5 dataset from `_path`.
    ///
    /// # Errors
    /// Always returns `DatasetsError::InvalidFormat` until the backend
    /// is implemented.
    pub fn read<P: AsRef<Path>>(&self, _path: P, _dataset_name: &str) -> Result<Dataset> {
        Err(DatasetsError::InvalidFormat(
            "HDF5 reading requires scirs2-io hdf5 feature (in development)".to_string(),
        ))
    }
}
/// Writer for HDF5 dataset files.
///
/// Currently a stub: `write` always errors until the `scirs2-io` hdf5
/// backend is available.
#[cfg(feature = "formats")]
#[derive(Default)]
pub struct Hdf5Writer {
    // Write-side tuning options; unused by the stub implementation.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl Hdf5Writer {
    /// Create a writer with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a writer with an explicit configuration.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Write `_dataset` to `_path` under the given dataset name.
    ///
    /// # Errors
    /// Always returns `DatasetsError::InvalidFormat` until the backend
    /// is implemented.
    pub fn write<P: AsRef<Path>>(
        &self,
        _dataset: &Dataset,
        _path: P,
        _dataset_name: &str,
    ) -> Result<()> {
        Err(DatasetsError::InvalidFormat(
            "HDF5 writing requires scirs2-io hdf5 feature (in development)".to_string(),
        ))
    }
}
/// Converts datasets between supported on-disk formats and reads files
/// with automatic format detection.
#[cfg(feature = "formats")]
#[derive(Default)]
pub struct FormatConverter {
    // Options forwarded to the underlying readers/writers.
    config: FormatConfig,
}

#[cfg(feature = "formats")]
impl FormatConverter {
    /// Create a converter with the default [`FormatConfig`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a converter with an explicit configuration.
    ///
    /// Mirrors the `with_config` constructors on the readers and writers;
    /// previously the converter stored a config but never used it.
    pub fn with_config(config: FormatConfig) -> Self {
        Self { config }
    }

    /// Read `input_path` as `input_format` and write it to `output_path`
    /// as `output_format`, forwarding this converter's configuration.
    ///
    /// # Errors
    /// Returns `DatasetsError::InvalidFormat` for CSV/Arrow (not yet
    /// supported here) and for any backend that is still a stub.
    pub fn convert<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        input_path: P1,
        input_format: FormatType,
        output_path: P2,
        output_format: FormatType,
    ) -> Result<()> {
        let dataset = match input_format {
            // Forward self.config instead of dropping it on the floor.
            FormatType::Parquet => {
                ParquetReader::with_config(self.config.clone()).read(input_path)?
            }
            FormatType::Hdf5 => {
                Hdf5Reader::with_config(self.config.clone()).read(input_path, "data")?
            }
            FormatType::Csv => {
                return Err(DatasetsError::InvalidFormat(
                    "CSV reading via format converter not yet implemented".to_string(),
                ))
            }
            FormatType::Arrow => {
                return Err(DatasetsError::InvalidFormat(
                    "Arrow format not yet supported".to_string(),
                ))
            }
        };
        match output_format {
            FormatType::Parquet => {
                ParquetWriter::with_config(self.config.clone()).write(&dataset, output_path)?
            }
            FormatType::Hdf5 => {
                Hdf5Writer::with_config(self.config.clone()).write(&dataset, output_path, "data")?
            }
            FormatType::Csv => {
                return Err(DatasetsError::InvalidFormat(
                    "CSV writing via format converter not yet implemented".to_string(),
                ))
            }
            FormatType::Arrow => {
                return Err(DatasetsError::InvalidFormat(
                    "Arrow format not yet supported".to_string(),
                ))
            }
        }
        Ok(())
    }

    /// Read a dataset, inferring the format from the file extension.
    ///
    /// # Errors
    /// Returns `DatasetsError::InvalidFormat` when the path is not valid
    /// UTF-8, the extension is unrecognized, or the format has no reader.
    pub fn read_auto<P: AsRef<Path>>(&self, path: P) -> Result<Dataset> {
        let path_str = path
            .as_ref()
            .to_str()
            .ok_or_else(|| DatasetsError::InvalidFormat("Invalid path".to_string()))?;
        let format = FormatType::from_extension(path_str)
            .ok_or_else(|| DatasetsError::InvalidFormat("Could not detect format".to_string()))?;
        // Exhaustive match (no `_`) so adding a FormatType variant forces
        // this function to be revisited.
        match format {
            FormatType::Parquet => ParquetReader::with_config(self.config.clone()).read(path),
            FormatType::Hdf5 => Hdf5Reader::with_config(self.config.clone()).read(path, "data"),
            FormatType::Csv | FormatType::Arrow => Err(DatasetsError::InvalidFormat(format!(
                "Unsupported format: {:?}",
                format
            ))),
        }
    }
}
/// Read a Parquet file into a `Dataset` using the default configuration.
///
/// Convenience wrapper around [`ParquetReader::read`].
#[cfg(feature = "formats")]
pub fn read_parquet<P: AsRef<Path>>(path: P) -> Result<Dataset> {
ParquetReader::new().read(path)
}
/// Write a `Dataset` to a Parquet file using the default configuration.
///
/// Convenience wrapper around [`ParquetWriter::write`].
#[cfg(feature = "formats")]
pub fn write_parquet<P: AsRef<Path>>(dataset: &Dataset, path: P) -> Result<()> {
ParquetWriter::new().write(dataset, path)
}
/// Read the named dataset from an HDF5 file using the default configuration.
///
/// Convenience wrapper around [`Hdf5Reader::read`].
#[cfg(feature = "formats")]
pub fn read_hdf5<P: AsRef<Path>>(path: P, dataset_name: &str) -> Result<Dataset> {
Hdf5Reader::new().read(path, dataset_name)
}
/// Write a `Dataset` to an HDF5 file under `dataset_name` using the
/// default configuration.
///
/// Convenience wrapper around [`Hdf5Writer::write`].
#[cfg(feature = "formats")]
pub fn write_hdf5<P: AsRef<Path>>(dataset: &Dataset, path: P, dataset_name: &str) -> Result<()> {
Hdf5Writer::new().write(dataset, path, dataset_name)
}
/// Read a dataset, detecting the format from the file extension.
///
/// Convenience wrapper around [`FormatConverter::read_auto`].
#[cfg(feature = "formats")]
pub fn read_auto<P: AsRef<Path>>(path: P) -> Result<Dataset> {
FormatConverter::new().read_auto(path)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection covers every extension alias, is case-insensitive, and
    /// rejects unknown or missing extensions.
    #[test]
    fn test_format_detection() {
        assert_eq!(
            FormatType::from_extension("data.parquet"),
            Some(FormatType::Parquet)
        );
        assert_eq!(
            FormatType::from_extension("data.pq"),
            Some(FormatType::Parquet)
        );
        assert_eq!(
            FormatType::from_extension("data.arrow"),
            Some(FormatType::Arrow)
        );
        assert_eq!(FormatType::from_extension("data.h5"), Some(FormatType::Hdf5));
        assert_eq!(
            FormatType::from_extension("data.hdf5"),
            Some(FormatType::Hdf5)
        );
        assert_eq!(FormatType::from_extension("data.csv"), Some(FormatType::Csv));
        // Case-insensitive: the path is lowercased before matching.
        assert_eq!(FormatType::from_extension("DATA.CSV"), Some(FormatType::Csv));
        assert_eq!(FormatType::from_extension("data.txt"), None);
        assert_eq!(FormatType::from_extension("no_extension"), None);
    }

    /// Every variant reports its canonical extension.
    #[test]
    fn test_format_extension() {
        assert_eq!(FormatType::Parquet.extension(), "parquet");
        assert_eq!(FormatType::Arrow.extension(), "arrow");
        assert_eq!(FormatType::Hdf5.extension(), "h5");
        assert_eq!(FormatType::Csv.extension(), "csv");
    }

    /// Level-free codecs report no level; Gzip/Zstd expose their defaults.
    #[test]
    fn test_compression_codec() {
        assert_eq!(CompressionCodec::None.level(), None);
        assert_eq!(CompressionCodec::Snappy.level(), None);
        assert_eq!(CompressionCodec::Lz4.level(), None);
        assert_eq!(CompressionCodec::Gzip.level(), Some(6));
        assert_eq!(CompressionCodec::Zstd.level(), Some(3));
    }

    /// Default configuration matches the documented defaults, including
    /// the previously untested 8 MiB buffer size.
    #[test]
    fn test_format_config() {
        let config = FormatConfig::default();
        assert_eq!(config.chunk_size, 10_000);
        assert_eq!(config.compression, Some(CompressionCodec::Snappy));
        assert!(config.use_mmap);
        assert_eq!(config.buffer_size, 8 * 1024 * 1024);
    }
}