use std::fmt;
use crate::field::Field;
/// The logical data types that a schema field can have.
///
/// `PartialOrd`/`Ord` are derived, so comparisons follow the declaration order
/// of the variants below; reordering variants changes comparison results.
/// With the `serde` feature enabled the enum is also (de)serializable.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum DataType {
/// Null type; `primitive_width` reports no fixed byte width for it.
Null,
/// Boolean type; `primitive_width` reports no fixed byte width for it.
Boolean,
/// Signed integer with a 1-byte `primitive_width`.
Int8,
/// Signed integer with a 2-byte `primitive_width`.
Int16,
/// Signed integer with a 4-byte `primitive_width`.
Int32,
/// Signed integer with an 8-byte `primitive_width`.
Int64,
/// Unsigned integer with a 1-byte `primitive_width`.
UInt8,
/// Unsigned integer with a 2-byte `primitive_width`.
UInt16,
/// Unsigned integer with a 4-byte `primitive_width`.
UInt32,
/// Unsigned integer with an 8-byte `primitive_width`.
UInt64,
/// Floating-point number with a 2-byte `primitive_width`.
Float16,
/// Floating-point number with a 4-byte `primitive_width`.
Float32,
/// Floating-point number with an 8-byte `primitive_width`.
Float64,
/// Timestamp (8-byte `primitive_width`) expressed in the given [`TimeUnit`].
/// NOTE(review): the optional string is presumably a timezone — confirm
/// against the Arrow specification.
Timestamp(TimeUnit, Option<String>),
/// Date with a 4-byte `primitive_width`.
Date32,
/// Date with an 8-byte `primitive_width`.
Date64,
/// Time value with a 4-byte `primitive_width`, in the given [`TimeUnit`].
Time32(TimeUnit),
/// Time value with an 8-byte `primitive_width`, in the given [`TimeUnit`].
Time64(TimeUnit),
/// Duration with an 8-byte `primitive_width`, in the given [`TimeUnit`].
Duration(TimeUnit),
/// Interval whose `primitive_width` depends on the [`IntervalUnit`].
Interval(IntervalUnit),
/// Binary data; no fixed `primitive_width`.
Binary,
/// Binary data parameterised by an `i32`.
/// NOTE(review): the `i32` is presumably the per-value byte length — confirm.
FixedSizeBinary(i32),
/// Binary data; no fixed `primitive_width`.
/// NOTE(review): presumably the large-offset counterpart of `Binary` — confirm.
LargeBinary,
/// UTF-8 string data; no fixed `primitive_width`.
Utf8,
/// UTF-8 string data; no fixed `primitive_width`.
/// NOTE(review): presumably the large-offset counterpart of `Utf8` — confirm.
LargeUtf8,
/// List type; the boxed [`Field`] describes the child (type and nullability).
List(Box<Field>),
/// List type parameterised by a child [`Field`] and an `i32`.
/// NOTE(review): the `i32` is presumably the number of elements per list — confirm.
FixedSizeList(Box<Field>, i32),
/// List type; the boxed [`Field`] describes the child.
/// NOTE(review): presumably the large-offset counterpart of `List` — confirm.
LargeList(Box<Field>),
/// Struct type made of the given child fields.
Struct(Vec<Field>),
/// Union type: child fields, a `Vec<i8>` and the [`UnionMode`].
/// NOTE(review): the `Vec<i8>` presumably holds the type ids of the children — confirm.
Union(Vec<Field>, Vec<i8>, UnionMode),
/// Dictionary type. `is_nested` inspects the second component as the value
/// type; the first is the key type (see `is_dictionary_key_type`).
Dictionary(Box<DataType>, Box<DataType>),
/// Decimal with a 16-byte `primitive_width`.
/// NOTE(review): the components are presumably `(precision, scale)`, matching
/// the types of the `DECIMAL128_MAX_*` constants — confirm.
Decimal128(u8, i8),
/// Decimal with a 32-byte `primitive_width`.
/// NOTE(review): the components are presumably `(precision, scale)`, matching
/// the types of the `DECIMAL256_MAX_*` constants — confirm.
Decimal256(u8, i8),
/// Map type: a child [`Field`] plus a flag that `equals_datatype` names
/// `is_sorted`.
Map(Box<Field>, bool),
/// Run-end encoded type; `size` names the two child fields `run_ends` and
/// `values`, in that order.
RunEndEncoded(Box<Field>, Box<Field>),
}
/// Resolution used by the temporal `DataType` variants that carry a unit
/// (`Timestamp`, `Time32`, `Time64`, `Duration`).
///
/// `PartialOrd`/`Ord` are derived, so the variants compare in declaration order.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TimeUnit {
/// Seconds.
Second,
/// Milliseconds.
Millisecond,
/// Microseconds.
Microsecond,
/// Nanoseconds.
Nanosecond,
}
/// Flavour of a `DataType::Interval`; it determines the value's byte width
/// as reported by `DataType::primitive_width`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum IntervalUnit {
/// Year/month interval; 4-byte `primitive_width`.
YearMonth,
/// Day/time interval; 8-byte `primitive_width`.
DayTime,
/// Month/day/nanosecond interval; 16-byte `primitive_width`.
MonthDayNano,
}
/// Mode carried by `DataType::Union`. The type is `Copy`.
/// NOTE(review): the two modes presumably correspond to the sparse and dense
/// union layouts of the Arrow columnar format — confirm against the spec.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnionMode {
/// Sparse union mode.
Sparse,
/// Dense union mode.
Dense,
}
/// `Display` for `DataType` reuses the derived `Debug` rendering.
impl fmt::Display for DataType {
    /// Writes the `{:?}` rendering of `self` into `f`.
    ///
    /// The text is produced through a fresh `format_args!`, so formatter
    /// flags supplied by the caller (width, `#`, ...) are not forwarded to
    /// the `Debug` implementation.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_fmt(format_args!("{self:?}"))
    }
}
impl DataType {
/// Returns `true` when the type is numeric (see `is_numeric`) or temporal
/// (see `is_temporal`).
#[inline]
pub fn is_primitive(&self) -> bool {
    if self.is_numeric() {
        return true;
    }
    self.is_temporal()
}
#[inline]
pub fn is_numeric(&self) -> bool {
use DataType::*;
matches!(
self,
UInt8
| UInt16
| UInt32
| UInt64
| Int8
| Int16
| Int32
| Int64
| Float16
| Float32
| Float64
| Decimal128(_, _)
| Decimal256(_, _)
)
}
#[inline]
pub fn is_temporal(&self) -> bool {
use DataType::*;
matches!(
self,
Date32
| Date64
| Timestamp(_, _)
| Time32(_)
| Time64(_)
| Duration(_)
| Interval(_)
)
}
#[inline]
pub fn is_dictionary_key_type(&self) -> bool {
use DataType::*;
matches!(
self,
UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
)
}
#[inline]
pub fn is_run_ends_type(&self) -> bool {
use DataType::*;
matches!(self, Int16 | Int32 | Int64)
}
pub fn is_nested(&self) -> bool {
use DataType::*;
match self {
Dictionary(_, v) => DataType::is_nested(v.as_ref()),
List(_)
| FixedSizeList(_, _)
| LargeList(_)
| Struct(_)
| Union(_, _, _)
| Map(_, _) => true,
_ => false,
}
}
pub fn equals_datatype(&self, other: &DataType) -> bool {
match (&self, other) {
(DataType::List(a), DataType::List(b))
| (DataType::LargeList(a), DataType::LargeList(b)) => {
a.is_nullable() == b.is_nullable()
&& a.data_type().equals_datatype(b.data_type())
}
(DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
a_size == b_size
&& a.is_nullable() == b.is_nullable()
&& a.data_type().equals_datatype(b.data_type())
}
(DataType::Struct(a), DataType::Struct(b)) => {
a.len() == b.len()
&& a.iter().zip(b).all(|(a, b)| {
a.is_nullable() == b.is_nullable()
&& a.data_type().equals_datatype(b.data_type())
})
}
(
DataType::Map(a_field, a_is_sorted),
DataType::Map(b_field, b_is_sorted),
) => a_field == b_field && a_is_sorted == b_is_sorted,
_ => self == other,
}
}
#[inline]
pub fn primitive_width(&self) -> Option<usize> {
match self {
DataType::Null => None,
DataType::Boolean => None,
DataType::Int8 | DataType::UInt8 => Some(1),
DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
DataType::Timestamp(_, _) => Some(8),
DataType::Date32 | DataType::Time32(_) => Some(4),
DataType::Date64 | DataType::Time64(_) => Some(8),
DataType::Duration(_) => Some(8),
DataType::Interval(IntervalUnit::YearMonth) => Some(4),
DataType::Interval(IntervalUnit::DayTime) => Some(8),
DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
DataType::Decimal128(_, _) => Some(16),
DataType::Decimal256(_, _) => Some(32),
DataType::Utf8 | DataType::LargeUtf8 => None,
DataType::Binary | DataType::LargeBinary => None,
DataType::FixedSizeBinary(_) => None,
DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => None,
DataType::FixedSizeList(_, _) => None,
DataType::Struct(_) => None,
DataType::Union(_, _, _) => None,
DataType::Dictionary(_, _) => None,
DataType::RunEndEncoded(_, _) => None,
}
}
pub fn size(&self) -> usize {
std::mem::size_of_val(self)
+ match self {
DataType::Null
| DataType::Boolean
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Float16
| DataType::Float32
| DataType::Float64
| DataType::Date32
| DataType::Date64
| DataType::Time32(_)
| DataType::Time64(_)
| DataType::Duration(_)
| DataType::Interval(_)
| DataType::Binary
| DataType::FixedSizeBinary(_)
| DataType::LargeBinary
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Decimal128(_, _)
| DataType::Decimal256(_, _) => 0,
DataType::Timestamp(_, s) => {
s.as_ref().map(|s| s.capacity()).unwrap_or_default()
}
DataType::List(field)
| DataType::FixedSizeList(field, _)
| DataType::LargeList(field)
| DataType::Map(field, _) => field.size(),
DataType::Struct(fields) | DataType::Union(fields, _, _) => {
fields
.iter()
.map(|field| field.size() - std::mem::size_of_val(field))
.sum::<usize>()
+ (std::mem::size_of::<Field>() * fields.capacity())
}
DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
DataType::RunEndEncoded(run_ends, values) => {
run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
- std::mem::size_of_val(values)
}
}
}
}
/// Maximum precision for `DataType::Decimal128`.
pub const DECIMAL128_MAX_PRECISION: u8 = 38;
/// Maximum scale for `DataType::Decimal128`.
pub const DECIMAL128_MAX_SCALE: i8 = 38;
/// Maximum precision for `DataType::Decimal256`.
pub const DECIMAL256_MAX_PRECISION: u8 = 76;
/// Maximum scale for `DataType::Decimal256`.
pub const DECIMAL256_MAX_SCALE: i8 = 76;
/// Default decimal scale.
/// NOTE(review): where this default is applied is not visible in this file.
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
#[cfg(test)]
mod tests {
use super::*;
// Round-trips a nested `Struct` type through JSON and pins the exact
// serialized form. Only compiled when the `serde` feature is enabled.
#[test]
#[cfg(feature = "serde")]
fn serde_struct_type() {
use std::collections::HashMap;
// One field carries non-empty metadata, one carries explicitly empty metadata.
let kv_array = [("k".to_string(), "v".to_string())];
let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
let first_name =
Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
let last_name = Field::new("last_name", DataType::Utf8, false)
.with_metadata(HashMap::default());
// The third child is itself a struct, so nested serialization is covered too.
let person = DataType::Struct(vec![
first_name,
last_name,
Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
),
]);
let serialized = serde_json::to_string(&person).unwrap();
// Exact expected JSON; the trailing backslashes join the literal into one line.
assert_eq!(
"{\"Struct\":[\
{\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
{\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
{\"name\":\"address\",\"data_type\":{\"Struct\":\
[{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
{\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}",
serialized
);
// Deserializing the JSON must give back an equal type.
let deserialized = serde_json::from_str(&serialized).unwrap();
assert_eq!(person, deserialized);
}
// `equals_datatype` must ignore child field names but respect child types,
// nullability and list sizes, for lists, fixed-size lists and structs.
#[test]
fn test_list_datatype_equality() {
    // Builds a `List` with a single child field.
    let list_of = |name: &str, child: DataType, nullable: bool| {
        DataType::List(Box::new(Field::new(name, child, nullable)))
    };
    let item_i32 = list_of("item", DataType::Int32, true);
    let array_i32 = list_of("array", DataType::Int32, true);
    let item_i32_required = list_of("item", DataType::Int32, false);
    let item_u32 = list_of("item", DataType::UInt32, true);

    // Only the child name differs: equal.
    assert!(item_i32.equals_datatype(&array_i32));
    // Nullability differs: not equal.
    assert!(!item_i32.equals_datatype(&item_i32_required));
    assert!(!array_i32.equals_datatype(&item_i32_required));
    // Child type differs: not equal.
    assert!(!item_i32.equals_datatype(&item_u32));

    // Builds a `FixedSizeList` of size 3 with a single child field.
    let fixed_of = |name: &str, child: DataType, nullable: bool| {
        DataType::FixedSizeList(Box::new(Field::new(name, child, nullable)), 3)
    };
    let fixed_item = fixed_of("item", item_i32, false);
    let fixed_array = fixed_of("array", array_i32, false);
    let fixed_binary = fixed_of("item", DataType::FixedSizeBinary(3), true);

    assert!(fixed_item.equals_datatype(&fixed_array));
    assert!(!fixed_item.equals_datatype(&fixed_binary));
    assert!(!fixed_array.equals_datatype(&fixed_binary));

    // Single-field structs: same shape, then differing nullability.
    let single_a = DataType::Struct(vec![Field::new("f1", fixed_item, true)]);
    let single_b = DataType::Struct(vec![Field::new("f1", fixed_array.clone(), true)]);
    let single_required = DataType::Struct(vec![Field::new("f1", fixed_array.clone(), false)]);

    // Three-field structs that differ only in field names and the last type.
    let triple = |names: [&str; 3], last: DataType| {
        DataType::Struct(vec![
            Field::new(names[0], fixed_array.clone(), false),
            Field::new(names[1], fixed_binary.clone(), false),
            Field::new(names[2], last, true),
        ])
    };
    let triple_f_utf8 = triple(["f1", "f2", "f3"], DataType::Utf8);
    let triple_ff_large = triple(["ff1", "ff2", "ff3"], DataType::LargeUtf8);
    let triple_ff_utf8 = triple(["ff1", "ff2", "ff3"], DataType::Utf8);

    assert!(single_a.equals_datatype(&single_b));
    assert!(!single_a.equals_datatype(&single_required));
    assert!(!triple_f_utf8.equals_datatype(&triple_ff_large));
    assert!(triple_f_utf8.equals_datatype(&triple_ff_utf8));
}
// Smoke test: a struct type containing a nested struct can be constructed.
#[test]
fn create_struct_type() {
    let address = DataType::Struct(vec![
        Field::new("street", DataType::Utf8, false),
        Field::new("zip", DataType::UInt16, false),
    ]);
    let children = vec![
        Field::new("first_name", DataType::Utf8, false),
        Field::new("last_name", DataType::Utf8, false),
        Field::new("address", address, false),
    ];
    let _person = DataType::Struct(children);
}
// `is_nested` is false for flat types, true for lists, and for dictionaries
// follows the dictionary's value type.
#[test]
fn test_nested() {
    let list = DataType::List(Box::new(Field::new("foo", DataType::Utf8, true)));

    for flat in [DataType::Boolean, DataType::Int32, DataType::Utf8] {
        assert!(!flat.is_nested());
    }
    assert!(list.is_nested());

    // Dictionary keyed by `Int32` over the given value type.
    let dictionary_of = |values: DataType| {
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(values))
    };
    for flat_values in [DataType::Boolean, DataType::Int64, DataType::LargeUtf8] {
        assert!(!dictionary_of(flat_values).is_nested());
    }
    assert!(dictionary_of(list).is_nested());
}
}