use crate::error::IoError;
#[cfg(feature = "datetime")]
use minarrow::TimeUnit;
use minarrow::{ArrowType, ffi::arrow_dtype::CategoricalIndexType};
/// Parquet physical (on-disk) column types handled by this module.
///
/// Each discriminant is the wire code of the `Type` enum in `parquet.thrift`
/// (`BOOLEAN = 0`, `INT32 = 1`, `INT64 = 2`, `INT96 = 3`, `FLOAT = 4`,
/// `DOUBLE = 5`, `BYTE_ARRAY = 6`, `FIXED_LEN_BYTE_ARRAY = 7`).
///
/// `INT96` (3) and `FIXED_LEN_BYTE_ARRAY` (7) have no variant here, so
/// [`ParquetPhysicalType::from_i32`] returns `None` for them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ParquetPhysicalType {
    Boolean = 0,
    Int32 = 1,
    Int64 = 2,
    // Code 3 is INT96 in the Parquet format; FLOAT and DOUBLE are 4 and 5.
    Float = 4,
    Double = 5,
    ByteArray = 6,
}

impl ParquetPhysicalType {
    /// Returns the Parquet wire code for this physical type.
    pub fn as_i32(self) -> i32 {
        self as i32
    }

    /// Decodes a Parquet physical type code.
    ///
    /// Returns `None` for any code without a variant in this enum.
    pub fn from_i32(val: i32) -> Option<Self> {
        match val {
            0 => Some(Self::Boolean),
            1 => Some(Self::Int32),
            2 => Some(Self::Int64),
            4 => Some(Self::Float),
            5 => Some(Self::Double),
            6 => Some(Self::ByteArray),
            _ => None,
        }
    }
}
/// Logical annotation attached to a Parquet physical type.
///
/// The date/time variants only exist when the `datetime` feature is enabled.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum ParquetLogicalType {
    /// No logical annotation.
    NoneType,
    /// UTF-8 encoded string data.
    Utf8,
    #[cfg(feature = "datetime")]
    Date32,
    #[cfg(feature = "datetime")]
    Date64,
    #[cfg(feature = "datetime")]
    TimestampMillis,
    #[cfg(feature = "datetime")]
    TimestampMicros,
    #[cfg(feature = "datetime")]
    TimestampNanos,
    #[cfg(feature = "datetime")]
    TimeMillis,
    #[cfg(feature = "datetime")]
    TimeMicros,
    #[cfg(feature = "datetime")]
    TimeNanos,
    /// Integer annotation carrying the logical width and signedness.
    IntType {
        /// Logical width in bits.
        bit_width: u8,
        /// `true` for signed integers, `false` for unsigned.
        is_signed: bool,
    },
}

impl ParquetLogicalType {
    /// Translates an optional converted-type code into a logical type.
    ///
    /// Returns `None` when `id` is `None` or when the code has no mapping.
    /// Codes 4-7, 9 and 10 are only recognised with the `datetime` feature.
    /// This function never produces `NoneType`, `TimestampNanos` or
    /// `TimeNanos`.
    ///
    /// NOTE(review): the numeric codes accepted here do not follow the
    /// `ConvertedType` numbering in `parquet.thrift` (where, for example,
    /// `UTF8 = 0`, `DATE = 6` and `UINT_8 = 11`). Presumably they mirror what
    /// this crate's writer emits - confirm before relying on this for files
    /// produced by other tools.
    pub fn from_converted_type(id: Option<i32>) -> Option<Self> {
        let code = id?;
        let int = |bit_width: u8, is_signed: bool| Self::IntType {
            bit_width,
            is_signed,
        };

        let logical = match code {
            1 => Self::Utf8,
            #[cfg(feature = "datetime")]
            4 => Self::Date32,
            #[cfg(feature = "datetime")]
            5 => Self::Date64,
            #[cfg(feature = "datetime")]
            6 => Self::TimeMillis,
            #[cfg(feature = "datetime")]
            7 => Self::TimeMicros,
            #[cfg(feature = "datetime")]
            9 => Self::TimestampMillis,
            #[cfg(feature = "datetime")]
            10 => Self::TimestampMicros,
            11 => int(8, true),
            12 => int(16, true),
            13 => int(32, true),
            14 => int(64, true),
            15 => int(8, false),
            16 => int(16, false),
            17 => int(32, false),
            18 => int(64, false),
            // Every other code (0, 2, 3, 8, 19 and above, negatives) is unmapped.
            _ => return None,
        };
        Some(logical)
    }
}
/// Page/value encodings identified by their integer code.
///
/// Code 1 has no variant, so [`ParquetEncoding::from_i32`] yields `None` for it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ParquetEncoding {
    Plain = 0,
    PlainDictionary = 2,
    Rle = 3,
    BitPacked = 4,
    DeltaBinaryPacked = 5,
    DeltaLengthByteArray = 6,
    DeltaByteArray = 7,
    RleDictionary = 8,
    ByteStreamSplit = 9,
}

impl ParquetEncoding {
    /// Looks up the encoding whose discriminant equals `v`.
    ///
    /// Returns `None` when no variant carries that code.
    pub fn from_i32(v: i32) -> Option<Self> {
        // Driving the lookup from the discriminants keeps decode and encode
        // in agreement by construction.
        const KNOWN: [ParquetEncoding; 9] = [
            ParquetEncoding::Plain,
            ParquetEncoding::PlainDictionary,
            ParquetEncoding::Rle,
            ParquetEncoding::BitPacked,
            ParquetEncoding::DeltaBinaryPacked,
            ParquetEncoding::DeltaLengthByteArray,
            ParquetEncoding::DeltaByteArray,
            ParquetEncoding::RleDictionary,
            ParquetEncoding::ByteStreamSplit,
        ];
        KNOWN
            .iter()
            .copied()
            .find(|encoding| encoding.to_i32() == v)
    }

    /// Returns the integer code of this encoding.
    pub fn to_i32(self) -> i32 {
        self as i32
    }
}
/// Maps an Arrow type to the Parquet physical type and logical annotation
/// used to describe it.
///
/// Mapping summary:
/// - `Boolean`, `Float32`, `Float64`: matching physical type, `NoneType`.
/// - Integers: `Int32` for widths up to 32 bits, `Int64` for 64 bits, each
///   with an `IntType` annotation recording width and signedness (8/16-bit
///   types require the `extended_numeric_types` feature).
/// - `Dictionary(UInt32)`: plain `Int32` with `NoneType`.
/// - `String`, `Utf8View` and (with `large_string`) `LargeString`:
///   `ByteArray` + `Utf8`.
/// - With `datetime`: `Date32`/`Date64`, `Timestamp` (always `Int64`),
///   `Time32` (`Int32`) and `Time64` (`Int64`).
///
/// NOTE(review): `TimeUnit::Seconds` is labelled with the millisecond
/// annotation and `TimeUnit::Days` with a date annotation. This function
/// only sees the type, not the values - presumably the caller rescales the
/// data accordingly; confirm.
///
/// # Errors
/// Returns `IoError::UnsupportedType` for `Null` and, when the corresponding
/// features are enabled, for `Duration32`, `Duration64`, `Interval` and
/// dictionary index types other than `UInt32`. Unsupported types are
/// reported through the `Result` rather than by panicking.
pub(crate) fn arrow_type_to_parquet(
    ty: &ArrowType,
) -> Result<(ParquetPhysicalType, ParquetLogicalType), IoError> {
    match ty {
        ArrowType::Boolean => Ok((ParquetPhysicalType::Boolean, ParquetLogicalType::NoneType)),

        // --- Integers -------------------------------------------------------
        #[cfg(feature = "extended_numeric_types")]
        ArrowType::Int8 => Ok((
            ParquetPhysicalType::Int32,
            ParquetLogicalType::IntType {
                bit_width: 8,
                is_signed: true,
            },
        )),
        #[cfg(feature = "extended_numeric_types")]
        ArrowType::Int16 => Ok((
            ParquetPhysicalType::Int32,
            ParquetLogicalType::IntType {
                bit_width: 16,
                is_signed: true,
            },
        )),
        ArrowType::Int32 => Ok((
            ParquetPhysicalType::Int32,
            ParquetLogicalType::IntType {
                bit_width: 32,
                is_signed: true,
            },
        )),
        ArrowType::Int64 => Ok((
            ParquetPhysicalType::Int64,
            ParquetLogicalType::IntType {
                bit_width: 64,
                is_signed: true,
            },
        )),
        #[cfg(feature = "extended_numeric_types")]
        ArrowType::UInt8 => Ok((
            ParquetPhysicalType::Int32,
            ParquetLogicalType::IntType {
                bit_width: 8,
                is_signed: false,
            },
        )),
        #[cfg(feature = "extended_numeric_types")]
        ArrowType::UInt16 => Ok((
            ParquetPhysicalType::Int32,
            ParquetLogicalType::IntType {
                bit_width: 16,
                is_signed: false,
            },
        )),
        ArrowType::UInt32 => Ok((
            ParquetPhysicalType::Int32,
            ParquetLogicalType::IntType {
                bit_width: 32,
                is_signed: false,
            },
        )),
        ArrowType::UInt64 => Ok((
            ParquetPhysicalType::Int64,
            ParquetLogicalType::IntType {
                bit_width: 64,
                is_signed: false,
            },
        )),

        // --- Categorical / floats / strings ---------------------------------
        ArrowType::Dictionary(CategoricalIndexType::UInt32) => {
            Ok((ParquetPhysicalType::Int32, ParquetLogicalType::NoneType))
        }
        ArrowType::Float32 => Ok((ParquetPhysicalType::Float, ParquetLogicalType::NoneType)),
        ArrowType::Float64 => Ok((ParquetPhysicalType::Double, ParquetLogicalType::NoneType)),
        ArrowType::String => Ok((ParquetPhysicalType::ByteArray, ParquetLogicalType::Utf8)),
        #[cfg(feature = "large_string")]
        ArrowType::LargeString => Ok((ParquetPhysicalType::ByteArray, ParquetLogicalType::Utf8)),
        ArrowType::Utf8View => Ok((ParquetPhysicalType::ByteArray, ParquetLogicalType::Utf8)),

        // --- Dates and times ------------------------------------------------
        #[cfg(feature = "datetime")]
        ArrowType::Date32 => Ok((ParquetPhysicalType::Int32, ParquetLogicalType::Date32)),
        #[cfg(feature = "datetime")]
        ArrowType::Date64 => Ok((ParquetPhysicalType::Int64, ParquetLogicalType::Date64)),
        #[cfg(feature = "datetime")]
        ArrowType::Timestamp(unit, _) => match unit {
            TimeUnit::Milliseconds => Ok((
                ParquetPhysicalType::Int64,
                ParquetLogicalType::TimestampMillis,
            )),
            TimeUnit::Microseconds => Ok((
                ParquetPhysicalType::Int64,
                ParquetLogicalType::TimestampMicros,
            )),
            TimeUnit::Nanoseconds => Ok((
                ParquetPhysicalType::Int64,
                ParquetLogicalType::TimestampNanos,
            )),
            // There is no seconds annotation; the millisecond one is used.
            TimeUnit::Seconds => Ok((
                ParquetPhysicalType::Int64,
                ParquetLogicalType::TimestampMillis,
            )),
            TimeUnit::Days => Ok((ParquetPhysicalType::Int64, ParquetLogicalType::Date64)),
        },
        #[cfg(feature = "datetime")]
        ArrowType::Time32(unit) => match unit {
            TimeUnit::Milliseconds => {
                Ok((ParquetPhysicalType::Int32, ParquetLogicalType::TimeMillis))
            }
            TimeUnit::Microseconds => {
                Ok((ParquetPhysicalType::Int32, ParquetLogicalType::TimeMicros))
            }
            TimeUnit::Nanoseconds => {
                Ok((ParquetPhysicalType::Int32, ParquetLogicalType::TimeNanos))
            }
            TimeUnit::Seconds => Ok((ParquetPhysicalType::Int32, ParquetLogicalType::TimeMillis)),
            TimeUnit::Days => Ok((ParquetPhysicalType::Int32, ParquetLogicalType::Date32)),
        },
        #[cfg(feature = "datetime")]
        ArrowType::Time64(unit) => match unit {
            TimeUnit::Milliseconds => {
                Ok((ParquetPhysicalType::Int64, ParquetLogicalType::TimeMillis))
            }
            TimeUnit::Microseconds => {
                Ok((ParquetPhysicalType::Int64, ParquetLogicalType::TimeMicros))
            }
            TimeUnit::Nanoseconds => {
                Ok((ParquetPhysicalType::Int64, ParquetLogicalType::TimeNanos))
            }
            TimeUnit::Seconds => Ok((ParquetPhysicalType::Int64, ParquetLogicalType::TimeMillis)),
            TimeUnit::Days => Ok((ParquetPhysicalType::Int64, ParquetLogicalType::Date64)),
        },

        // --- Unsupported types: reported as errors, never panics -------------
        ArrowType::Null => Err(IoError::UnsupportedType(
            "Null type is not supported".into(),
        )),
        #[cfg(feature = "datetime")]
        ArrowType::Duration32(_) | ArrowType::Duration64(_) => Err(IoError::UnsupportedType(
            "Duration does not map to a parquet type.".into(),
        )),
        #[cfg(feature = "datetime")]
        ArrowType::Interval(_) => Err(IoError::UnsupportedType(
            "Interval does not map to a parquet type.".into(),
        )),
        #[cfg(all(feature = "extended_categorical", feature = "extended_numeric_types"))]
        ArrowType::Dictionary(CategoricalIndexType::UInt8)
        | ArrowType::Dictionary(CategoricalIndexType::UInt16)
        | ArrowType::Dictionary(CategoricalIndexType::UInt64) => Err(IoError::UnsupportedType(
            "Dictionary index types other than UInt32 are not supported".into(),
        )),
    }
}
/// Maps a Parquet physical type plus optional logical annotation to the
/// Arrow type used to represent it.
///
/// `Some(ParquetLogicalType::NoneType)` is treated exactly like `None`, so an
/// un-annotated `Int32`/`Int64` column resolves to `ArrowType::Int32`/
/// `ArrowType::Int64` regardless of which of the two spellings the caller
/// passes.
///
/// `Boolean`, `Float` and `Double` ignore the annotation entirely. UTF-8 byte
/// arrays become `LargeString` when the `large_string` feature is enabled and
/// `String` otherwise. Timestamps are produced without a timezone.
///
/// # Errors
/// Returns `IoError::UnsupportedType` for un-annotated `ByteArray` (binary)
/// columns and for every physical/logical combination without a mapping.
pub(crate) fn parquet_to_arrow_type(
    physical: ParquetPhysicalType,
    logical: Option<ParquetLogicalType>,
) -> Result<ArrowType, IoError> {
    // `NoneType` means "no annotation"; fold it into `None` so both spellings
    // take the same arms below.
    let logical = logical.filter(|annotation| !matches!(annotation, ParquetLogicalType::NoneType));

    // Match on a borrow so `logical` stays available for the error message
    // without cloning it.
    match (physical, &logical) {
        (ParquetPhysicalType::Boolean, _) => Ok(ArrowType::Boolean),

        // --- Annotated integers ----------------------------------------------
        #[cfg(feature = "extended_numeric_types")]
        (
            ParquetPhysicalType::Int32,
            Some(ParquetLogicalType::IntType {
                bit_width: 8,
                is_signed: true,
            }),
        ) => Ok(ArrowType::Int8),
        #[cfg(feature = "extended_numeric_types")]
        (
            ParquetPhysicalType::Int32,
            Some(ParquetLogicalType::IntType {
                bit_width: 16,
                is_signed: true,
            }),
        ) => Ok(ArrowType::Int16),
        (
            ParquetPhysicalType::Int32,
            Some(ParquetLogicalType::IntType {
                bit_width: 32,
                is_signed: true,
            }),
        ) => Ok(ArrowType::Int32),
        #[cfg(feature = "extended_numeric_types")]
        (
            ParquetPhysicalType::Int32,
            Some(ParquetLogicalType::IntType {
                bit_width: 8,
                is_signed: false,
            }),
        ) => Ok(ArrowType::UInt8),
        #[cfg(feature = "extended_numeric_types")]
        (
            ParquetPhysicalType::Int32,
            Some(ParquetLogicalType::IntType {
                bit_width: 16,
                is_signed: false,
            }),
        ) => Ok(ArrowType::UInt16),
        (
            ParquetPhysicalType::Int32,
            Some(ParquetLogicalType::IntType {
                bit_width: 32,
                is_signed: false,
            }),
        ) => Ok(ArrowType::UInt32),
        (
            ParquetPhysicalType::Int64,
            Some(ParquetLogicalType::IntType {
                bit_width: 64,
                is_signed: true,
            }),
        ) => Ok(ArrowType::Int64),
        (
            ParquetPhysicalType::Int64,
            Some(ParquetLogicalType::IntType {
                bit_width: 64,
                is_signed: false,
            }),
        ) => Ok(ArrowType::UInt64),

        // --- Un-annotated integers default to the signed type ----------------
        (ParquetPhysicalType::Int32, None) => Ok(ArrowType::Int32),
        (ParquetPhysicalType::Int64, None) => Ok(ArrowType::Int64),

        // --- Dates and times -------------------------------------------------
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int32, Some(ParquetLogicalType::Date32)) => Ok(ArrowType::Date32),
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::Date64)) => Ok(ArrowType::Date64),
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::TimestampMillis)) => {
            Ok(ArrowType::Timestamp(TimeUnit::Milliseconds, None))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::TimestampMicros)) => {
            Ok(ArrowType::Timestamp(TimeUnit::Microseconds, None))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::TimestampNanos)) => {
            Ok(ArrowType::Timestamp(TimeUnit::Nanoseconds, None))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int32, Some(ParquetLogicalType::TimeMillis)) => {
            Ok(ArrowType::Time32(TimeUnit::Milliseconds))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int32, Some(ParquetLogicalType::TimeMicros)) => {
            Ok(ArrowType::Time32(TimeUnit::Microseconds))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int32, Some(ParquetLogicalType::TimeNanos)) => {
            Ok(ArrowType::Time32(TimeUnit::Nanoseconds))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::TimeMillis)) => {
            Ok(ArrowType::Time64(TimeUnit::Milliseconds))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::TimeMicros)) => {
            Ok(ArrowType::Time64(TimeUnit::Microseconds))
        }
        #[cfg(feature = "datetime")]
        (ParquetPhysicalType::Int64, Some(ParquetLogicalType::TimeNanos)) => {
            Ok(ArrowType::Time64(TimeUnit::Nanoseconds))
        }

        // --- Floats ignore any annotation ------------------------------------
        (ParquetPhysicalType::Float, _) => Ok(ArrowType::Float32),
        (ParquetPhysicalType::Double, _) => Ok(ArrowType::Float64),

        // --- Byte arrays -----------------------------------------------------
        #[cfg(not(feature = "large_string"))]
        (ParquetPhysicalType::ByteArray, Some(ParquetLogicalType::Utf8)) => Ok(ArrowType::String),
        #[cfg(feature = "large_string")]
        (ParquetPhysicalType::ByteArray, Some(ParquetLogicalType::Utf8)) => {
            Ok(ArrowType::LargeString)
        }
        (ParquetPhysicalType::ByteArray, None) => {
            Err(IoError::UnsupportedType("Binary not supported".into()))
        }

        // Anything else is a combination this module has no mapping for.
        _ => Err(IoError::UnsupportedType(format!(
            "Parquet type {:?} + logical {:?} not supported",
            physical, logical
        ))),
    }
}