use thiserror::Error;
/// Physical type of a Parquet column.
///
/// The variants appear in the same order as the integer codes (0 through 7)
/// recognised by [`ParquetType::from_thrift`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ParquetType {
    /// Produced from type code 0.
    Boolean,
    /// Produced from type code 1.
    Int32,
    /// Produced from type code 2.
    Int64,
    /// Produced from type code 3.
    Int96,
    /// Produced from type code 4.
    Float,
    /// Produced from type code 5.
    Double,
    /// Produced from type code 6.
    ByteArray,
    /// Produced from type code 7.
    ///
    /// The `i32` payload presumably holds the fixed byte length of each
    /// value (TODO confirm); `from_thrift` always fills it with 0.
    FixedLenByteArray(i32),
}
impl ParquetType {
    /// Converts a Thrift-encoded physical type code into a [`ParquetType`].
    ///
    /// Codes 0 through 7 are recognised. Code 7 yields
    /// `FixedLenByteArray(0)`: this function only receives the type code, so
    /// the length payload is always set to 0 here.
    ///
    /// # Errors
    ///
    /// Returns [`ParquetError::UnsupportedType`] for any other code.
    pub fn from_thrift(code: i32) -> Result<Self> {
        let physical = match code {
            0 => Self::Boolean,
            1 => Self::Int32,
            2 => Self::Int64,
            3 => Self::Int96,
            4 => Self::Float,
            5 => Self::Double,
            6 => Self::ByteArray,
            7 => Self::FixedLenByteArray(0),
            _ => {
                let message = format!("Unknown physical type code: {code}");
                return Err(ParquetError::UnsupportedType(message));
            }
        };
        Ok(physical)
    }
}
/// Encoding applied to column data.
///
/// Values are produced from integer codes by [`Encoding::from_thrift`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Encoding {
    /// Plain encoding.
    Plain,
    /// Run-length / bit-packed encoding.
    RleBitPacked,
    /// Delta binary packed encoding.
    DeltaBinaryPacked,
    /// Delta-length byte array encoding.
    DeltaLengthByteArray,
    /// Delta byte array encoding.
    DeltaByteArray,
}
impl Encoding {
pub fn from_thrift(code: i32) -> Result<Self> {
match code {
0 => Ok(Encoding::Plain),
4 => Ok(Encoding::RleBitPacked),
5 => Ok(Encoding::DeltaBinaryPacked),
6 => Ok(Encoding::DeltaLengthByteArray),
7 => Ok(Encoding::DeltaByteArray),
_ => Ok(Encoding::Plain),
}
}
}
/// Compression codec applied to column data.
///
/// The variants appear in the same order as the integer codes (0 through 6)
/// recognised by [`Compression::from_thrift`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Compression {
    /// No compression (code 0).
    Uncompressed,
    /// Snappy (code 1).
    Snappy,
    /// Gzip (code 2).
    Gzip,
    /// LZO (code 3).
    Lzo,
    /// Brotli (code 4).
    Brotli,
    /// LZ4 (code 5).
    Lz4,
    /// Zstandard (code 6).
    Zstd,
}
impl Compression {
    /// Converts a Thrift-encoded compression codec code into a
    /// [`Compression`].
    ///
    /// Codes 0 through 6 are recognised.
    ///
    /// # Errors
    ///
    /// Returns [`ParquetError::UnsupportedCompression`] for any other code.
    pub fn from_thrift(code: i32) -> Result<Self> {
        let codec = match code {
            0 => Some(Self::Uncompressed),
            1 => Some(Self::Snappy),
            2 => Some(Self::Gzip),
            3 => Some(Self::Lzo),
            4 => Some(Self::Brotli),
            5 => Some(Self::Lz4),
            6 => Some(Self::Zstd),
            _ => None,
        };
        // The error message is only built when the code was not recognised.
        codec.ok_or_else(|| {
            ParquetError::UnsupportedCompression(format!("Unknown compression code: {code}"))
        })
    }
}
/// Kind of page stored in a column chunk.
///
/// The variants appear in the same order as the integer codes (0 through 3)
/// recognised by [`PageType::from_thrift`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PageType {
    /// Data page (code 0).
    DataPage,
    /// Index page (code 1).
    IndexPage,
    /// Dictionary page (code 2).
    DictionaryPage,
    /// Version 2 data page (code 3).
    DataPageV2,
}
impl PageType {
    /// Converts a Thrift-encoded page type code into a [`PageType`].
    ///
    /// Codes 0 through 3 are recognised.
    ///
    /// # Errors
    ///
    /// Returns [`ParquetError::DataError`] for any other code.
    pub fn from_thrift(code: i32) -> Result<Self> {
        Ok(match code {
            0 => Self::DataPage,
            1 => Self::IndexPage,
            2 => Self::DictionaryPage,
            3 => Self::DataPageV2,
            _ => {
                let message = format!("Unknown page type: {code}");
                return Err(ParquetError::DataError(message));
            }
        })
    }
}
#[derive(Debug, Error)]
pub enum ParquetError {
#[error("IO error: {0}")]
IoError(#[from] std::io::Error),
#[error("Invalid Parquet file: {0}")]
InvalidFile(String),
#[error("Unsupported physical type: {0}")]
UnsupportedType(String),
#[error("Unsupported compression: {0}")]
UnsupportedCompression(String),
#[error("Unsupported encoding: {0}")]
UnsupportedEncoding(String),
#[error("Data error: {0}")]
DataError(String),
#[error("Arrow conversion error: {0}")]
ArrowError(String),
#[error("Column index {0} out of range")]
ColumnOutOfRange(usize),
}
/// Convenience alias for `std::result::Result` with the error type fixed to
/// `ParquetError`.
pub type Result<T> = std::result::Result<T, ParquetError>;
/// Metadata describing a single column.
#[derive(Clone, Debug)]
pub struct ColumnMetadata {
    /// Column name.
    pub name: String,
    /// Physical type of the column's values.
    pub physical_type: ParquetType,
    /// Encoding recorded for the column.
    pub encoding: Encoding,
    /// Compression codec recorded for the column.
    pub compression: Compression,
    /// Number of values in the column.
    pub num_values: i64,
    /// Offset of the column's data -- presumably a byte offset within the
    /// file (TODO confirm against the reader).
    pub data_offset: i64,
    /// Total compressed size -- presumably in bytes (TODO confirm).
    pub total_compressed_size: i64,
    /// Total uncompressed size -- presumably in bytes (TODO confirm).
    pub total_uncompressed_size: i64,
}
/// Metadata describing one row group.
#[derive(Clone, Debug)]
pub struct RowGroupMetadata {
    /// Metadata for the columns of this row group.
    pub columns: Vec<ColumnMetadata>,
    /// Number of rows in this row group.
    pub num_rows: i64,
    /// Total size of this row group -- presumably in bytes (TODO confirm).
    pub total_byte_size: i64,
}
/// File-level Parquet metadata.
#[derive(Clone, Debug)]
pub struct ParquetMetadata {
    /// Version number recorded in the metadata.
    pub version: i32,
    /// Total number of rows.
    pub num_rows: i64,
    /// Number of columns.
    pub num_columns: usize,
    /// Names taken from the schema -- presumably one per column (TODO
    /// confirm against the code that fills this in).
    pub schema_names: Vec<String>,
    /// Metadata for each row group.
    pub row_groups: Vec<RowGroupMetadata>,
    /// Column metadata kept at file level.
    /// NOTE(review): how this relates to the per-row-group `columns` is not
    /// visible here -- confirm against the code that fills this in.
    pub columns: Vec<ColumnMetadata>,
    /// Optional "created by" string, when one is available.
    pub created_by: Option<String>,
}