mod field;
mod physical_type;
pub mod reshape;
mod schema;
use std::collections::BTreeMap;
use std::sync::Arc;
pub use field::Field;
pub use physical_type::*;
use polars_utils::pl_str::PlSmallStr;
pub use schema::{ArrowSchema, ArrowSchemaRef};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// Key/value metadata: an ordered map from string keys to string values.
pub type Metadata = BTreeMap<PlSmallStr, PlSmallStr>;
/// Extension information as a `(name, optional metadata)` pair, or `None`
/// when no extension is present. This is the value produced by `get_extension`.
pub(crate) type Extension = Option<(PlSmallStr, Option<PlSmallStr>)>;
/// The set of logical data types an array can have.
///
/// Every data type is backed by a `PhysicalType` (see
/// `ArrowDataType::to_physical_type`); several logical types share the same
/// physical representation. The default value is `Null`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum ArrowDataType {
/// Null type. This is the `Default` variant.
#[default]
Null,
/// Boolean type.
Boolean,
/// Signed 8-bit integer.
Int8,
/// Signed 16-bit integer.
Int16,
/// Signed 32-bit integer.
Int32,
/// Signed 64-bit integer.
Int64,
/// Unsigned 8-bit integer.
UInt8,
/// Unsigned 16-bit integer.
UInt16,
/// Unsigned 32-bit integer.
UInt32,
/// Unsigned 64-bit integer.
UInt64,
/// 16-bit floating point number.
Float16,
/// 32-bit floating point number.
Float32,
/// 64-bit floating point number.
Float64,
/// Timestamp in the given `TimeUnit`, physically stored as `Int64`.
///
/// NOTE(review): the optional string is presumably the timezone — confirm.
Timestamp(TimeUnit, Option<PlSmallStr>),
/// Date physically stored as `Int32`.
Date32,
/// Date physically stored as `Int64`.
Date64,
/// Time in the given `TimeUnit`, physically stored as `Int32`.
Time32(TimeUnit),
/// Time in the given `TimeUnit`, physically stored as `Int64`.
Time64(TimeUnit),
/// Duration in the given `TimeUnit`, physically stored as `Int64`.
Duration(TimeUnit),
/// Interval whose physical representation depends on the `IntervalUnit`.
Interval(IntervalUnit),
/// Binary data (physical type `Binary`).
Binary,
/// Fixed-size binary data (physical type `FixedSizeBinary`).
///
/// NOTE(review): the `usize` is presumably the size in bytes of each value — confirm.
FixedSizeBinary(usize),
/// Binary data with physical type `LargeBinary`.
///
/// NOTE(review): presumably the 64-bit-offset counterpart of `Binary` — confirm.
LargeBinary,
/// String data (physical type `Utf8`).
Utf8,
/// String data with physical type `LargeUtf8`.
///
/// NOTE(review): presumably the 64-bit-offset counterpart of `Utf8` — confirm.
LargeUtf8,
/// List whose elements are described by the boxed `Field`.
List(Box<Field>),
/// List with a fixed number of elements: the element `Field` and the list size.
FixedSizeList(Box<Field>, usize),
/// List with physical type `LargeList`, with elements described by the boxed `Field`.
LargeList(Box<Field>),
/// Struct made of the given fields.
Struct(Vec<Field>),
/// Union of the given fields, in the given `UnionMode`.
///
/// This variant is skipped by serde when the `serde` feature is enabled.
///
/// NOTE(review): the optional `Vec<i32>` presumably holds the type ids — confirm.
#[cfg_attr(feature = "serde", serde(skip))]
Union(Vec<Field>, Option<Vec<i32>>, UnionMode),
/// Map type described by the boxed `Field`.
///
/// NOTE(review): the `bool` presumably states whether keys are sorted — confirm.
Map(Box<Field>, bool),
/// Dictionary type: the integer type of the keys and a boxed data type.
///
/// NOTE(review): the boxed data type is presumably that of the values and the
/// `bool` presumably states whether the dictionary is sorted — confirm.
Dictionary(IntegerType, Box<ArrowDataType>, bool),
/// Decimal physically stored as `Int128`.
///
/// NOTE(review): the two values are presumably (precision, scale) — confirm.
Decimal(usize, usize),
/// Decimal physically stored as `Int256`.
///
/// NOTE(review): the two values are presumably (precision, scale) — confirm.
Decimal256(usize, usize),
/// Extension type wrapping another data type; its physical type is that of
/// the wrapped data type.
///
/// NOTE(review): the first string is presumably the extension name and the
/// optional string its metadata — confirm.
Extension(PlSmallStr, Box<ArrowDataType>, Option<PlSmallStr>),
/// Binary data with physical type `BinaryView`.
BinaryView,
/// String data with physical type `Utf8View`.
Utf8View,
/// Unknown data type; `to_physical_type` is not implemented for it.
Unknown,
}
/// The mode carried by a union data type: either dense or sparse.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnionMode {
    /// Dense union.
    Dense,
    /// Sparse union.
    Sparse,
}

impl UnionMode {
    /// Builds a [`UnionMode`] from a flag: `true` yields [`UnionMode::Sparse`],
    /// `false` yields [`UnionMode::Dense`].
    pub fn sparse(is_sparse: bool) -> Self {
        match is_sparse {
            true => Self::Sparse,
            false => Self::Dense,
        }
    }

    /// Returns `true` when the mode is [`UnionMode::Sparse`].
    pub fn is_sparse(&self) -> bool {
        *self == Self::Sparse
    }

    /// Returns `true` when the mode is [`UnionMode::Dense`].
    pub fn is_dense(&self) -> bool {
        *self == Self::Dense
    }
}
/// The unit of time carried by the `Timestamp`, `Time32`, `Time64` and
/// `Duration` variants of `ArrowDataType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum TimeUnit {
/// Seconds.
Second,
/// Milliseconds.
Millisecond,
/// Microseconds.
Microsecond,
/// Nanoseconds.
Nanosecond,
}
/// The unit carried by the `Interval` variant of `ArrowDataType`; it
/// determines the interval's physical representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum IntervalUnit {
/// Interval physically stored as the `Int32` primitive.
YearMonth,
/// Interval physically stored as the `DaysMs` primitive.
DayTime,
/// Interval physically stored as the `MonthDayNano` primitive.
MonthDayNano,
}
impl ArrowDataType {
pub fn to_physical_type(&self) -> PhysicalType {
use ArrowDataType::*;
match self {
Null => PhysicalType::Null,
Boolean => PhysicalType::Boolean,
Int8 => PhysicalType::Primitive(PrimitiveType::Int8),
Int16 => PhysicalType::Primitive(PrimitiveType::Int16),
Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => {
PhysicalType::Primitive(PrimitiveType::Int32)
},
Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => {
PhysicalType::Primitive(PrimitiveType::Int64)
},
Decimal(_, _) => PhysicalType::Primitive(PrimitiveType::Int128),
Decimal256(_, _) => PhysicalType::Primitive(PrimitiveType::Int256),
UInt8 => PhysicalType::Primitive(PrimitiveType::UInt8),
UInt16 => PhysicalType::Primitive(PrimitiveType::UInt16),
UInt32 => PhysicalType::Primitive(PrimitiveType::UInt32),
UInt64 => PhysicalType::Primitive(PrimitiveType::UInt64),
Float16 => PhysicalType::Primitive(PrimitiveType::Float16),
Float32 => PhysicalType::Primitive(PrimitiveType::Float32),
Float64 => PhysicalType::Primitive(PrimitiveType::Float64),
Interval(IntervalUnit::DayTime) => PhysicalType::Primitive(PrimitiveType::DaysMs),
Interval(IntervalUnit::MonthDayNano) => {
PhysicalType::Primitive(PrimitiveType::MonthDayNano)
},
Binary => PhysicalType::Binary,
FixedSizeBinary(_) => PhysicalType::FixedSizeBinary,
LargeBinary => PhysicalType::LargeBinary,
Utf8 => PhysicalType::Utf8,
LargeUtf8 => PhysicalType::LargeUtf8,
BinaryView => PhysicalType::BinaryView,
Utf8View => PhysicalType::Utf8View,
List(_) => PhysicalType::List,
FixedSizeList(_, _) => PhysicalType::FixedSizeList,
LargeList(_) => PhysicalType::LargeList,
Struct(_) => PhysicalType::Struct,
Union(_, _, _) => PhysicalType::Union,
Map(_, _) => PhysicalType::Map,
Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
Extension(_, key, _) => key.to_physical_type(),
Unknown => unimplemented!(),
}
}
pub fn underlying_physical_type(&self) -> ArrowDataType {
use ArrowDataType::*;
match self {
Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
Date64
| Timestamp(_, _)
| Time64(_)
| Duration(_)
| Interval(IntervalUnit::DayTime) => Int64,
Interval(IntervalUnit::MonthDayNano) => unimplemented!(),
Binary => Binary,
List(field) => List(Box::new(Field {
dtype: field.dtype.underlying_physical_type(),
..*field.clone()
})),
LargeList(field) => LargeList(Box::new(Field {
dtype: field.dtype.underlying_physical_type(),
..*field.clone()
})),
FixedSizeList(field, width) => FixedSizeList(
Box::new(Field {
dtype: field.dtype.underlying_physical_type(),
..*field.clone()
}),
*width,
),
Struct(fields) => Struct(
fields
.iter()
.map(|field| Field {
dtype: field.dtype.underlying_physical_type(),
..field.clone()
})
.collect(),
),
Dictionary(keys, _, _) => (*keys).into(),
Union(_, _, _) => unimplemented!(),
Map(_, _) => unimplemented!(),
Extension(_, inner, _) => inner.underlying_physical_type(),
_ => self.clone(),
}
}
pub fn to_logical_type(&self) -> &ArrowDataType {
use ArrowDataType::*;
match self {
Extension(_, key, _) => key.to_logical_type(),
_ => self,
}
}
pub fn inner_dtype(&self) -> Option<&ArrowDataType> {
match self {
ArrowDataType::List(inner) => Some(inner.dtype()),
ArrowDataType::LargeList(inner) => Some(inner.dtype()),
ArrowDataType::FixedSizeList(inner, _) => Some(inner.dtype()),
_ => None,
}
}
pub fn is_nested(&self) -> bool {
use ArrowDataType as D;
matches!(
self,
D::List(_)
| D::LargeList(_)
| D::FixedSizeList(_, _)
| D::Struct(_)
| D::Union(_, _, _)
| D::Map(_, _)
| D::Dictionary(_, _, _)
| D::Extension(_, _, _)
)
}
pub fn is_view(&self) -> bool {
matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView)
}
pub fn is_numeric(&self) -> bool {
use ArrowDataType as D;
matches!(
self,
D::Int8
| D::Int16
| D::Int32
| D::Int64
| D::UInt8
| D::UInt16
| D::UInt32
| D::UInt64
| D::Float32
| D::Float64
| D::Decimal(_, _)
| D::Decimal256(_, _)
)
}
pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType {
ArrowDataType::FixedSizeList(
Box::new(Field::new(
PlSmallStr::from_static("item"),
self,
is_nullable,
)),
size,
)
}
}
/// Converts an [`IntegerType`] into the [`ArrowDataType`] variant of the same name.
impl From<IntegerType> for ArrowDataType {
    fn from(item: IntegerType) -> Self {
        match item {
            // Signed integers.
            IntegerType::Int8 => Self::Int8,
            IntegerType::Int16 => Self::Int16,
            IntegerType::Int32 => Self::Int32,
            IntegerType::Int64 => Self::Int64,
            // Unsigned integers.
            IntegerType::UInt8 => Self::UInt8,
            IntegerType::UInt16 => Self::UInt16,
            IntegerType::UInt32 => Self::UInt32,
            IntegerType::UInt64 => Self::UInt64,
        }
    }
}
impl From<PrimitiveType> for ArrowDataType {
fn from(item: PrimitiveType) -> Self {
match item {
PrimitiveType::Int8 => ArrowDataType::Int8,
PrimitiveType::Int16 => ArrowDataType::Int16,
PrimitiveType::Int32 => ArrowDataType::Int32,
PrimitiveType::Int64 => ArrowDataType::Int64,
PrimitiveType::UInt8 => ArrowDataType::UInt8,
PrimitiveType::UInt16 => ArrowDataType::UInt16,
PrimitiveType::UInt32 => ArrowDataType::UInt32,
PrimitiveType::UInt64 => ArrowDataType::UInt64,
PrimitiveType::Int128 => ArrowDataType::Decimal(32, 32),
PrimitiveType::Int256 => ArrowDataType::Decimal256(32, 32),
PrimitiveType::Float16 => ArrowDataType::Float16,
PrimitiveType::Float32 => ArrowDataType::Float32,
PrimitiveType::Float64 => ArrowDataType::Float64,
PrimitiveType::DaysMs => ArrowDataType::Interval(IntervalUnit::DayTime),
PrimitiveType::MonthDayNano => ArrowDataType::Interval(IntervalUnit::MonthDayNano),
PrimitiveType::UInt128 => unimplemented!(),
}
}
}
/// An `ArrowSchema` shared behind an `Arc`.
pub type SchemaRef = Arc<ArrowSchema>;
/// Reads the extension information stored in `metadata`.
///
/// Returns `None` when `metadata` has no `"ARROW:extension:name"` entry.
/// Otherwise returns the extension name together with the value of the
/// `"ARROW:extension:metadata"` entry, if that entry exists.
pub fn get_extension(metadata: &Metadata) -> Extension {
    // Without a name there is no extension to report.
    let name = metadata.get(&PlSmallStr::from_static("ARROW:extension:name"))?;
    let extension_metadata = metadata
        .get(&PlSmallStr::from_static("ARROW:extension:metadata"))
        .cloned();
    Some((name.clone(), extension_metadata))
}
/// `IdxArr` is a `UInt32Array` unless the `bigidx` feature is enabled.
///
/// NOTE(review): presumably the array type used to hold indices — confirm.
#[cfg(not(feature = "bigidx"))]
pub type IdxArr = super::array::UInt32Array;
/// `IdxArr` is a `UInt64Array` when the `bigidx` feature is enabled.
///
/// NOTE(review): presumably the array type used to hold indices — confirm.
#[cfg(feature = "bigidx")]
pub type IdxArr = super::array::UInt64Array;