use std::fmt;
use crate::field::Field;
/// The set of logical data types a field or column can be declared with.
///
/// Variants either carry no payload (e.g. `Int32`, `Utf8`) or carry the
/// parameters that complete the type (a [`TimeUnit`], child [`Field`]s, sizes).
///
/// Declaration order is significant: the derived `PartialOrd`/`Ord` order
/// values by variant position first, so reordering variants changes sort
/// results. With the `serde` feature enabled the type also derives
/// `Serialize`/`Deserialize`, so variant names become part of the serialized
/// form.
///
/// NOTE(review): the variant names look like the logical types of the Apache
/// Arrow columnar format; payload meanings marked "presumably" below are not
/// established by this file and should be confirmed against that specification.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum DataType {
/// The null type.
Null,
/// Boolean type.
Boolean,
/// Signed integers; the suffix is the bit width.
Int8,
Int16,
Int32,
Int64,
/// Unsigned integers; the suffix is the bit width.
UInt8,
UInt16,
UInt32,
UInt64,
/// Floating point numbers; the suffix is the bit width.
/// `Float16` is not matched by `is_numeric` or `is_primitive`.
Float16,
Float32,
Float64,
/// Timestamp with a [`TimeUnit`] resolution and an optional string
/// (presumably a time zone name — TODO confirm).
Timestamp(TimeUnit, Option<String>),
/// Date types; the suffix is presumably the storage width in bits.
Date32,
Date64,
/// Time-of-day types parameterised by a [`TimeUnit`].
Time32(TimeUnit),
Time64(TimeUnit),
/// Elapsed time parameterised by a [`TimeUnit`].
Duration(TimeUnit),
/// Calendar interval parameterised by an [`IntervalUnit`].
Interval(IntervalUnit),
/// Variable-length binary data.
Binary,
/// Binary data with an `i32` parameter (presumably the fixed width in
/// bytes — TODO confirm).
FixedSizeBinary(i32),
/// Binary variant with a `Large` prefix (presumably wider offsets — TODO confirm).
LargeBinary,
/// String type (presumably UTF-8 encoded, going by the name).
Utf8,
/// String variant with a `Large` prefix (presumably wider offsets — TODO confirm).
LargeUtf8,
/// List whose element is described by the boxed child [`Field`].
List(Box<Field>),
/// List described by a child [`Field`] plus an `i32` (presumably the fixed
/// number of elements per slot — TODO confirm).
FixedSizeList(Box<Field>, i32),
/// `Large` counterpart of `List`, described by a child [`Field`].
LargeList(Box<Field>),
/// Struct made of the given child [`Field`]s, in order.
Struct(Vec<Field>),
/// Union of child [`Field`]s, a list of `i8` values (presumably the type
/// ids of the children — TODO confirm) and a [`UnionMode`].
Union(Vec<Field>, Vec<i8>, UnionMode),
/// Dictionary type built from two data types. `is_nested` inspects the
/// second component, and tests construct it as `(Int32, <value type>)`, so
/// the pair is presumably `(key type, value type)`.
Dictionary(Box<DataType>, Box<DataType>),
/// Decimal types with two `u8` parameters (presumably precision and
/// scale — TODO confirm).
Decimal128(u8, u8),
Decimal256(u8, u8),
/// Map described by a child [`Field`] and a flag that `equals_datatype`
/// refers to as `is_sorted`.
Map(Box<Field>, bool),
}
/// Resolution used by the time-related [`DataType`] variants
/// (`Timestamp`, `Time32`, `Time64` and `Duration`).
///
/// The enum is fieldless, so it derives `Copy`: values can be passed and
/// matched by value without an explicit `clone()`.
///
/// The derived ordering follows declaration order, i.e.
/// `Second < Millisecond < Microsecond < Nanosecond`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TimeUnit {
    /// Seconds.
    Second,
    /// Milliseconds.
    Millisecond,
    /// Microseconds.
    Microsecond,
    /// Nanoseconds.
    Nanosecond,
}
/// Parameter of [`DataType::Interval`] selecting which kind of interval is
/// stored.
///
/// The enum is fieldless, so it derives `Copy`: values can be passed and
/// matched by value without an explicit `clone()`.
///
/// NOTE(review): the variant names suggest which calendar components each
/// interval carries; the exact layout is not established by this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum IntervalUnit {
    /// Interval expressed in years and months (going by the name).
    YearMonth,
    /// Interval expressed in days and time of day (going by the name).
    DayTime,
    /// Interval expressed in months, days and nanoseconds (going by the name).
    MonthDayNano,
}
/// Third parameter of [`DataType::Union`].
///
/// The enum is fieldless, so it derives `Copy`: values can be passed and
/// matched by value without an explicit `clone()`.
///
/// NOTE(review): presumably selects how union children are laid out in
/// memory; this file only declares the two modes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnionMode {
    /// Sparse mode.
    Sparse,
    /// Dense mode.
    Dense,
}
/// `Display` renders a [`DataType`] exactly like its derived `Debug` output.
impl fmt::Display for DataType {
    /// Writes the `{:?}` rendering of `self` into `out`.
    ///
    /// The value is formatted through a fresh `format_args!` invocation, so
    /// flags given to the outer `Display` call (such as `#`) are not forwarded
    /// to the `Debug` implementation.
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        out.write_fmt(format_args!("{:?}", self))
    }
}
impl DataType {
    /// Returns `true` if `t` is a primitive type, i.e. numeric or temporal.
    ///
    /// Defined as `is_numeric(t) || is_temporal(t)` so the three predicates
    /// cannot drift apart.
    pub fn is_primitive(t: &DataType) -> bool {
        Self::is_numeric(t) || Self::is_temporal(t)
    }

    /// Returns `true` if `t` is one of the signed/unsigned integer types or
    /// `Float32`/`Float64`.
    ///
    /// NOTE(review): `Float16`, `Decimal128` and `Decimal256` are not matched.
    /// That is the pre-existing behaviour and is kept as-is because callers may
    /// branch on it; confirm before widening.
    pub fn is_numeric(t: &DataType) -> bool {
        use DataType::*;
        matches!(
            t,
            UInt8
                | UInt16
                | UInt32
                | UInt64
                | Int8
                | Int16
                | Int32
                | Int64
                | Float32
                | Float64
        )
    }

    /// Returns `true` if `t` is a date, time, timestamp, duration or interval
    /// type, whatever its unit parameters are.
    pub fn is_temporal(t: &DataType) -> bool {
        use DataType::*;
        matches!(
            t,
            Date32
                | Date64
                | Timestamp(_, _)
                | Time32(_)
                | Time64(_)
                | Duration(_)
                | Interval(_)
        )
    }

    /// Returns `true` if `t` is one of the eight signed/unsigned integer
    /// types, the only types accepted here as dictionary keys.
    pub fn is_dictionary_key_type(t: &DataType) -> bool {
        use DataType::*;
        matches!(
            t,
            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
        )
    }

    /// Returns `true` if `t` is a nested type: a list variant, struct, union
    /// or map.
    ///
    /// A `Dictionary` is classified by its second component, so a dictionary
    /// whose second type is a list counts as nested while one whose second
    /// type is `Utf8` does not.
    pub fn is_nested(t: &DataType) -> bool {
        use DataType::*;
        match t {
            Dictionary(_, v) => DataType::is_nested(v.as_ref()),
            List(_)
            | FixedSizeList(_, _)
            | LargeList(_)
            | Struct(_)
            | Union(_, _, _)
            | Map(_, _) => true,
            _ => false,
        }
    }

    /// Compares `self` with `other`, ignoring the names and metadata of nested
    /// child fields.
    ///
    /// Two child fields are considered equal when their nullability matches
    /// and their data types are equal under this same relation. This rule is
    /// applied to `List`, `LargeList`, `FixedSizeList` (which must also agree
    /// on the size), `Struct` (same number of children, compared pairwise in
    /// order) and `Map` (which must also agree on the `is_sorted` flag).
    /// `Dictionary` recurses into both of its component types.
    ///
    /// Every other combination, including `Union`, falls back to `==`.
    pub fn equals_datatype(&self, other: &DataType) -> bool {
        match (self, other) {
            (DataType::List(a), DataType::List(b))
            | (DataType::LargeList(a), DataType::LargeList(b)) => {
                a.is_nullable() == b.is_nullable()
                    && a.data_type().equals_datatype(b.data_type())
            }
            (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
                a_size == b_size
                    && a.is_nullable() == b.is_nullable()
                    && a.data_type().equals_datatype(b.data_type())
            }
            (DataType::Struct(a), DataType::Struct(b)) => {
                // The length check matters: `zip` alone would stop at the
                // shorter list and accept a prefix match.
                a.len() == b.len()
                    && a.iter().zip(b.iter()).all(|(a, b)| {
                        a.is_nullable() == b.is_nullable()
                            && a.data_type().equals_datatype(b.data_type())
                    })
            }
            (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => {
                // Same rule as the other nested types: comparing the fields
                // with `==` would make child names and metadata significant
                // for maps only.
                a_is_sorted == b_is_sorted
                    && a_field.is_nullable() == b_field.is_nullable()
                    && a_field.data_type().equals_datatype(b_field.data_type())
            }
            (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => {
                // Recurse so nested child names inside the component types
                // are ignored here as well.
                a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value)
            }
            _ => self == other,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips a nested struct type through JSON and pins the exact
    /// serialized form (only built when the `serde` feature is enabled).
    #[test]
    #[cfg(feature = "serde")]
    fn serde_struct_type() {
        use std::collections::BTreeMap;

        let mut tags: BTreeMap<String, String> = BTreeMap::new();
        tags.insert("k".to_string(), "v".to_string());

        let address = Field::new(
            "address",
            DataType::Struct(vec![
                Field::new("street", DataType::Utf8, false),
                Field::new("zip", DataType::UInt16, false),
            ]),
            false,
        );
        let person = DataType::Struct(vec![
            Field::new("first_name", DataType::Utf8, false).with_metadata(Some(tags)),
            Field::new("last_name", DataType::Utf8, false)
                .with_metadata(Some(BTreeMap::default())),
            address,
        ]);

        // One fragment per field so the expected document stays readable.
        let expected_json = concat!(
            r#"{"Struct":["#,
            r#"{"name":"first_name","data_type":"Utf8","nullable":false,"dict_id":0,"dict_is_ordered":false,"metadata":{"k":"v"}},"#,
            r#"{"name":"last_name","data_type":"Utf8","nullable":false,"dict_id":0,"dict_is_ordered":false},"#,
            r#"{"name":"address","data_type":{"Struct":"#,
            r#"[{"name":"street","data_type":"Utf8","nullable":false,"dict_id":0,"dict_is_ordered":false},"#,
            r#"{"name":"zip","data_type":"UInt16","nullable":false,"dict_id":0,"dict_is_ordered":false}"#,
            r#"]},"nullable":false,"dict_id":0,"dict_is_ordered":false}]}"#,
        );

        let serialized = serde_json::to_string(&person).unwrap();
        assert_eq!(expected_json, serialized);

        let deserialized: DataType = serde_json::from_str(&serialized).unwrap();
        assert_eq!(person, deserialized);
    }

    /// `equals_datatype` ignores child field names but not nullability or
    /// child types, through lists, fixed-size lists and structs.
    #[test]
    fn test_list_datatype_equality() {
        let list_of = |name: &str, item: DataType, nullable: bool| {
            DataType::List(Box::new(Field::new(name, item, nullable)))
        };
        let fixed_of = |name: &str, item: DataType, nullable: bool| {
            DataType::FixedSizeList(Box::new(Field::new(name, item, nullable)), 3)
        };

        // Plain lists.
        let nullable_item = list_of("item", DataType::Int32, true);
        let nullable_array = list_of("array", DataType::Int32, true);
        let required_item = list_of("item", DataType::Int32, false);
        let unsigned_item = list_of("item", DataType::UInt32, true);
        assert!(nullable_item.equals_datatype(&nullable_array));
        assert!(!nullable_item.equals_datatype(&required_item));
        assert!(!nullable_array.equals_datatype(&required_item));
        assert!(!nullable_item.equals_datatype(&unsigned_item));

        // Fixed-size lists wrapping the lists above.
        let fixed_item = fixed_of("item", nullable_item, false);
        let fixed_array = fixed_of("array", nullable_array, false);
        let fixed_binary = fixed_of("item", DataType::FixedSizeBinary(3), true);
        assert!(fixed_item.equals_datatype(&fixed_array));
        assert!(!fixed_item.equals_datatype(&fixed_binary));
        assert!(!fixed_array.equals_datatype(&fixed_binary));

        // Structs with a single child.
        let single = |child: &DataType, nullable: bool| {
            DataType::Struct(vec![Field::new("f1", child.clone(), nullable)])
        };
        let struct_h = single(&fixed_item, true);
        let struct_i = single(&fixed_array, true);
        let struct_j = single(&fixed_array, false);

        // Structs with three children; only names and the last type vary.
        let triple = |names: [&str; 3], last: DataType| {
            DataType::Struct(vec![
                Field::new(names[0], fixed_array.clone(), false),
                Field::new(names[1], fixed_binary.clone(), false),
                Field::new(names[2], last, true),
            ])
        };
        let struct_k = triple(["f1", "f2", "f3"], DataType::Utf8);
        let struct_l = triple(["ff1", "ff2", "ff3"], DataType::LargeUtf8);
        let struct_m = triple(["ff1", "ff2", "ff3"], DataType::Utf8);

        assert!(struct_h.equals_datatype(&struct_i));
        assert!(!struct_h.equals_datatype(&struct_j));
        assert!(!struct_k.equals_datatype(&struct_l));
        assert!(struct_k.equals_datatype(&struct_m));
    }

    /// Building a nested struct type must simply succeed.
    #[test]
    fn create_struct_type() {
        let address = DataType::Struct(vec![
            Field::new("street", DataType::Utf8, false),
            Field::new("zip", DataType::UInt16, false),
        ]);
        let _person = DataType::Struct(vec![
            Field::new("first_name", DataType::Utf8, false),
            Field::new("last_name", DataType::Utf8, false),
            Field::new("address", address, false),
        ]);
    }

    /// `is_nested` is true for lists and for dictionaries whose second
    /// component is nested, and false for flat types.
    #[test]
    fn test_nested() {
        let list = DataType::List(Box::new(Field::new("foo", DataType::Utf8, true)));
        let int32_dict = |value: DataType| {
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(value))
        };

        for flat in &[DataType::Boolean, DataType::Int32, DataType::Utf8] {
            assert!(!DataType::is_nested(flat));
        }
        assert!(DataType::is_nested(&list));

        for flat_value in &[DataType::Boolean, DataType::Int64, DataType::LargeUtf8] {
            assert!(!DataType::is_nested(&int32_dict(flat_value.clone())));
        }
        assert!(DataType::is_nested(&int32_dict(list)));
    }
}