polars_arrow/datatypes/
mod.rs

1//! Contains all metadata, such as [`PhysicalType`], [`ArrowDataType`], [`Field`] and [`ArrowSchema`].
2
3mod field;
4mod physical_type;
5pub mod reshape;
6mod schema;
7
8use std::collections::BTreeMap;
9use std::sync::Arc;
10
11pub use field::{
12    DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY,
13    DTYPE_ENUM_VALUES_NEW, Field,
14};
15pub use physical_type::*;
16use polars_utils::pl_str::PlSmallStr;
17pub use schema::{ArrowSchema, ArrowSchemaRef};
18#[cfg(feature = "serde")]
19use serde::{Deserialize, Serialize};
20
21use crate::array::LIST_VALUES_NAME;
22
/// typedef for `BTreeMap<PlSmallStr, PlSmallStr>` denoting [`Field`]'s and [`ArrowSchema`]'s metadata.
///
/// Both keys and values are [`PlSmallStr`]s; [`get_extension`] reads the
/// `ARROW:extension:name` and `ARROW:extension:metadata` keys from such a map.
pub type Metadata = BTreeMap<PlSmallStr, PlSmallStr>;
/// typedef for `Option<(PlSmallStr, Option<PlSmallStr>)>` describing an optional extension:
/// the extension's name paired with its optional metadata, as produced by [`get_extension`].
/// `None` means that no extension is present.
pub(crate) type Extension = Option<(PlSmallStr, Option<PlSmallStr>)>;
27
/// The set of supported logical types in this crate.
///
/// Each variant uniquely identifies a logical type, which defines specific semantics for the data
/// (e.g. how it should be represented).
/// Each variant has a corresponding [`PhysicalType`], obtained via [`ArrowDataType::to_physical_type`],
/// which declares the in-memory representation of data.
/// The [`ArrowDataType::Extension`] is special in that it augments an [`ArrowDataType`] with metadata to support custom types.
/// Use [`ArrowDataType::to_logical_type`] to desugar such a type and return its corresponding logical type.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum ArrowDataType {
    /// Null type. This is the [`Default`] variant.
    #[default]
    Null,
    /// `true` and `false`.
    Boolean,
    /// An [`i8`]
    Int8,
    /// An [`i16`]
    Int16,
    /// An [`i32`]
    Int32,
    /// An [`i64`]
    Int64,
    /// An [`i128`]
    Int128,
    /// A [`u8`]
    UInt8,
    /// A [`u16`]
    UInt16,
    /// A [`u32`]
    UInt32,
    /// A [`u64`]
    UInt64,
    /// A 16-bit float
    Float16,
    /// An [`f32`]
    Float32,
    /// An [`f64`]
    Float64,
    /// An [`i64`] representing a timestamp measured in [`TimeUnit`] with an optional timezone.
    ///
    /// Time is measured relative to the Unix epoch (00:00:00.000 on 1 January 1970),
    /// excluding leap seconds, as a 64-bit signed integer count of the given [`TimeUnit`].
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// When the timezone is not specified, the timestamp is considered to have no timezone
    /// and is represented _as is_
    Timestamp(TimeUnit, Option<PlSmallStr>),
    /// An [`i32`] representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date32,
    /// An [`i64`] representing the elapsed time since UNIX epoch (1970-01-01)
    /// in milliseconds. Values are evenly divisible by 86400000.
    Date64,
    /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Only [`TimeUnit::Second`] and [`TimeUnit::Millisecond`] are supported on this variant.
    Time32(TimeUnit),
    /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Only [`TimeUnit::Microsecond`] and [`TimeUnit::Nanosecond`] are supported on this variant.
    Time64(TimeUnit),
    /// Measure of elapsed time. This elapsed time is a physical duration (i.e. 1s as defined in S.I.)
    Duration(TimeUnit),
    /// A "calendar" interval modeling elapsed time that takes into account calendar shifts.
    /// For example an interval of 1 day may represent more than 24 hours.
    /// The [`IntervalUnit`] selects how the interval is stored.
    Interval(IntervalUnit),
    /// Opaque binary data of variable length whose offsets are represented as [`i32`].
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(usize),
    /// Opaque binary data of variable length whose offsets are represented as [`i64`].
    LargeBinary,
    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i32`].
    Utf8,
    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i64`].
    LargeUtf8,
    /// A list of some logical data type whose offsets are represented as [`i32`].
    List(Box<Field>),
    /// A list of some logical data type with a fixed number of elements.
    /// The second parameter is that number of elements.
    FixedSizeList(Box<Field>, usize),
    /// A list of some logical data type whose offsets are represented as [`i64`].
    LargeList(Box<Field>),
    /// A nested [`ArrowDataType`] with a given number of [`Field`]s.
    Struct(Vec<Field>),
    /// A nested type that is represented as
    ///
    /// List<entries: Struct<key: K, value: V>>
    ///
    /// In this layout, the keys and values are each respectively contiguous. We do
    /// not constrain the key and value types, so the application is responsible
    /// for ensuring that the keys are hashable and unique. Whether the keys are sorted
    /// may be set in the metadata for this field.
    ///
    /// In a field with Map type, the field has a child Struct field, which then
    /// has two children: the first the key type and the second the value type. The names of the
    /// child fields may be respectively "entries", "key", and "value", but this is
    /// not enforced.
    ///
    /// Map
    /// ```text
    ///   - child[0] entries: Struct
    ///     - child[0] key: K
    ///     - child[1] value: V
    /// ```
    /// Neither the "entries" field nor the "key" field may be nullable.
    ///
    /// The metadata is structured so that Arrow systems without special handling
    /// for Map can make Map an alias for List. The "layout" attribute for the Map
    /// field must have the same contents as a List.
    ///
    /// The variant's parameters are, in order:
    /// - Field
    /// - ordered
    Map(Box<Field>, bool),
    /// A dictionary encoded array (`key_type`, `value_type`), where
    /// each array element is an index of `key_type` into an
    /// associated dictionary of `value_type`.
    ///
    /// Dictionary arrays are used to store columns of `value_type`
    /// that contain many repeated values using less memory, but with
    /// a higher CPU overhead for some operations.
    ///
    /// This type is mostly used to represent low cardinality string
    /// arrays or a limited set of primitive types as integers.
    ///
    /// The `bool` value indicates the `Dictionary` is sorted if set to `true`.
    Dictionary(IntegerType, Box<ArrowDataType>, bool),
    /// Decimal value with precision and scale
    /// precision is the number of digits in the number and
    /// scale is the number of decimal places.
    /// The number 999.99 has a precision of 5 and scale of 2.
    ///
    /// [`ArrowDataType::to_physical_type`] maps this variant to a 128-bit integer.
    Decimal(usize, usize),
    /// Decimal backed by 32 bits
    Decimal32(usize, usize),
    /// Decimal backed by 64 bits
    Decimal64(usize, usize),
    /// Decimal backed by 256 bits
    Decimal256(usize, usize),
    /// Extension type: a named wrapper, with optional metadata, around an inner
    /// [`ArrowDataType`] (see [`ExtensionType`]).
    Extension(Box<ExtensionType>),
    /// A binary type that inlines small values
    /// and can intern bytes.
    BinaryView,
    /// A string type that inlines small values
    /// and can intern strings.
    Utf8View,
    /// A type unknown to Arrow.
    ///
    /// [`ArrowDataType::to_physical_type`] is not implemented for this variant.
    Unknown,
    /// A nested datatype that can represent slots of differing types.
    /// The child fields, the optional ids and the [`UnionMode`] are carried by the
    /// boxed [`UnionType`].
    ///
    /// This variant is marked `serde(skip)` when the `serde` or `dsl-schema` feature is enabled.
    #[cfg_attr(any(feature = "serde", feature = "dsl-schema"), serde(skip))]
    Union(Box<UnionType>),
}
187
/// Payload of [`ArrowDataType::Extension`]: a named wrapper around another
/// [`ArrowDataType`], with optional metadata.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct ExtensionType {
    /// Name of the extension type.
    pub name: PlSmallStr,
    /// The wrapped data type. [`ArrowDataType::to_logical_type`] and
    /// [`ArrowDataType::to_physical_type`] delegate to it.
    pub inner: ArrowDataType,
    /// Optional metadata string of the extension.
    // NOTE(review): presumably the value stored under the `ARROW:extension:metadata`
    // key read by `get_extension` — confirm against the construction sites.
    pub metadata: Option<PlSmallStr>,
}
196
/// Payload of [`ArrowDataType::Union`].
///
/// Unlike the other types declared in this module, this struct has no serde derive;
/// the `Union` variant that holds it is marked `serde(skip)`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct UnionType {
    /// The child fields of the union.
    pub fields: Vec<Field>,
    /// Optional ids.
    // NOTE(review): presumably the type ids associated with the entries of `fields` — confirm.
    pub ids: Option<Vec<i32>>,
    /// Whether the union is dense or sparse.
    pub mode: UnionMode,
}
203
/// The two modes an [`ArrowDataType::Union`] can be in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnionMode {
    /// The union is dense.
    Dense,
    /// The union is sparse.
    Sparse,
}

impl UnionMode {
    /// Builds a mode from a flag: `true` yields [`UnionMode::Sparse`],
    /// `false` yields [`UnionMode::Dense`].
    pub fn sparse(is_sparse: bool) -> Self {
        match is_sparse {
            true => Self::Sparse,
            false => Self::Dense,
        }
    }

    /// Returns `true` exactly when the mode is [`UnionMode::Sparse`].
    pub fn is_sparse(&self) -> bool {
        *self == Self::Sparse
    }

    /// Returns `true` exactly when the mode is [`UnionMode::Dense`].
    pub fn is_dense(&self) -> bool {
        *self == Self::Dense
    }
}
231
/// The time units defined in Arrow.
///
/// Used as the resolution parameter of [`ArrowDataType::Timestamp`],
/// [`ArrowDataType::Time32`], [`ArrowDataType::Time64`] and [`ArrowDataType::Duration`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum TimeUnit {
    /// Time in seconds.
    Second,
    /// Time in milliseconds.
    Millisecond,
    /// Time in microseconds.
    Microsecond,
    /// Time in nanoseconds.
    Nanosecond,
}
246
/// Interval units defined in Arrow.
///
/// Selects the representation of an [`ArrowDataType::Interval`]; each unit maps to a
/// different primitive in [`ArrowDataType::to_physical_type`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum IntervalUnit {
    /// The number of elapsed whole months.
    /// Physically a `PrimitiveType::Int32`.
    YearMonth,
    /// The number of elapsed days and milliseconds,
    /// stored as 2 contiguous `i32`.
    /// Physically a `PrimitiveType::DaysMs`.
    DayTime,
    /// The number of elapsed months (i32), days (i32) and nanoseconds (i64).
    /// Physically a `PrimitiveType::MonthDayNano`.
    MonthDayNano,
}
260
261impl ArrowDataType {
262    /// Polars IdxSize type, dependent on bigidx feature
263    pub const IDX_DTYPE: Self = {
264        #[cfg(not(feature = "bigidx"))]
265        {
266            ArrowDataType::UInt32
267        }
268        #[cfg(feature = "bigidx")]
269        {
270            ArrowDataType::UInt64
271        }
272    };
273
274    /// the [`PhysicalType`] of this [`ArrowDataType`].
275    pub fn to_physical_type(&self) -> PhysicalType {
276        use ArrowDataType::*;
277        match self {
278            Null => PhysicalType::Null,
279            Boolean => PhysicalType::Boolean,
280            Int8 => PhysicalType::Primitive(PrimitiveType::Int8),
281            Int16 => PhysicalType::Primitive(PrimitiveType::Int16),
282            Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => {
283                PhysicalType::Primitive(PrimitiveType::Int32)
284            },
285            Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => {
286                PhysicalType::Primitive(PrimitiveType::Int64)
287            },
288            Decimal(_, _) => PhysicalType::Primitive(PrimitiveType::Int128),
289            Decimal32(_, _) => PhysicalType::Primitive(PrimitiveType::Int32),
290            Decimal64(_, _) => PhysicalType::Primitive(PrimitiveType::Int64),
291            Decimal256(_, _) => PhysicalType::Primitive(PrimitiveType::Int256),
292            UInt8 => PhysicalType::Primitive(PrimitiveType::UInt8),
293            UInt16 => PhysicalType::Primitive(PrimitiveType::UInt16),
294            UInt32 => PhysicalType::Primitive(PrimitiveType::UInt32),
295            UInt64 => PhysicalType::Primitive(PrimitiveType::UInt64),
296            Float16 => PhysicalType::Primitive(PrimitiveType::Float16),
297            Float32 => PhysicalType::Primitive(PrimitiveType::Float32),
298            Float64 => PhysicalType::Primitive(PrimitiveType::Float64),
299            Int128 => PhysicalType::Primitive(PrimitiveType::Int128),
300            Interval(IntervalUnit::DayTime) => PhysicalType::Primitive(PrimitiveType::DaysMs),
301            Interval(IntervalUnit::MonthDayNano) => {
302                PhysicalType::Primitive(PrimitiveType::MonthDayNano)
303            },
304            Binary => PhysicalType::Binary,
305            FixedSizeBinary(_) => PhysicalType::FixedSizeBinary,
306            LargeBinary => PhysicalType::LargeBinary,
307            Utf8 => PhysicalType::Utf8,
308            LargeUtf8 => PhysicalType::LargeUtf8,
309            BinaryView => PhysicalType::BinaryView,
310            Utf8View => PhysicalType::Utf8View,
311            List(_) => PhysicalType::List,
312            FixedSizeList(_, _) => PhysicalType::FixedSizeList,
313            LargeList(_) => PhysicalType::LargeList,
314            Struct(_) => PhysicalType::Struct,
315            Union(_) => PhysicalType::Union,
316            Map(_, _) => PhysicalType::Map,
317            Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
318            Extension(ext) => ext.inner.to_physical_type(),
319            Unknown => unimplemented!(),
320        }
321    }
322
323    // The datatype underlying this (possibly logical) arrow data type.
324    pub fn underlying_physical_type(&self) -> ArrowDataType {
325        use ArrowDataType::*;
326        match self {
327            Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
328            Date64
329            | Timestamp(_, _)
330            | Time64(_)
331            | Duration(_)
332            | Interval(IntervalUnit::DayTime) => Int64,
333            Interval(IntervalUnit::MonthDayNano) => unimplemented!(),
334            Binary => Binary,
335            List(field) => List(Box::new(Field {
336                dtype: field.dtype.underlying_physical_type(),
337                ..*field.clone()
338            })),
339            LargeList(field) => LargeList(Box::new(Field {
340                dtype: field.dtype.underlying_physical_type(),
341                ..*field.clone()
342            })),
343            FixedSizeList(field, width) => FixedSizeList(
344                Box::new(Field {
345                    dtype: field.dtype.underlying_physical_type(),
346                    ..*field.clone()
347                }),
348                *width,
349            ),
350            Struct(fields) => Struct(
351                fields
352                    .iter()
353                    .map(|field| Field {
354                        dtype: field.dtype.underlying_physical_type(),
355                        ..field.clone()
356                    })
357                    .collect(),
358            ),
359            Dictionary(keys, _, _) => (*keys).into(),
360            Union(_) => unimplemented!(),
361            Map(_, _) => unimplemented!(),
362            Extension(ext) => ext.inner.underlying_physical_type(),
363            _ => self.clone(),
364        }
365    }
366
367    /// Returns `&self` for all but [`ArrowDataType::Extension`]. For [`ArrowDataType::Extension`],
368    /// (recursively) returns the inner [`ArrowDataType`].
369    /// Never returns the variant [`ArrowDataType::Extension`].
370    pub fn to_logical_type(&self) -> &ArrowDataType {
371        use ArrowDataType::*;
372        match self {
373            Extension(ext) => ext.inner.to_logical_type(),
374            _ => self,
375        }
376    }
377
378    pub fn inner_dtype(&self) -> Option<&ArrowDataType> {
379        match self {
380            ArrowDataType::List(inner) => Some(inner.dtype()),
381            ArrowDataType::LargeList(inner) => Some(inner.dtype()),
382            ArrowDataType::FixedSizeList(inner, _) => Some(inner.dtype()),
383            _ => None,
384        }
385    }
386
387    pub fn is_nested(&self) -> bool {
388        use ArrowDataType as D;
389
390        matches!(
391            self,
392            D::List(_)
393                | D::LargeList(_)
394                | D::FixedSizeList(_, _)
395                | D::Struct(_)
396                | D::Union(_)
397                | D::Map(_, _)
398                | D::Dictionary(_, _, _)
399                | D::Extension(_)
400        )
401    }
402
403    pub fn is_view(&self) -> bool {
404        matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView)
405    }
406
407    pub fn is_numeric(&self) -> bool {
408        use ArrowDataType as D;
409        matches!(
410            self,
411            D::Int8
412                | D::Int16
413                | D::Int32
414                | D::Int64
415                | D::Int128
416                | D::UInt8
417                | D::UInt16
418                | D::UInt32
419                | D::UInt64
420                | D::Float32
421                | D::Float64
422                | D::Decimal(_, _)
423                | D::Decimal32(_, _)
424                | D::Decimal64(_, _)
425                | D::Decimal256(_, _)
426        )
427    }
428
429    pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType {
430        ArrowDataType::FixedSizeList(
431            Box::new(Field::new(LIST_VALUES_NAME, self, is_nullable)),
432            size,
433        )
434    }
435
436    /// Check (recursively) whether datatype contains an [`ArrowDataType::Dictionary`] type.
437    pub fn contains_dictionary(&self) -> bool {
438        use ArrowDataType as D;
439        match self {
440            D::Null
441            | D::Boolean
442            | D::Int8
443            | D::Int16
444            | D::Int32
445            | D::Int64
446            | D::UInt8
447            | D::UInt16
448            | D::UInt32
449            | D::UInt64
450            | D::Int128
451            | D::Float16
452            | D::Float32
453            | D::Float64
454            | D::Timestamp(_, _)
455            | D::Date32
456            | D::Date64
457            | D::Time32(_)
458            | D::Time64(_)
459            | D::Duration(_)
460            | D::Interval(_)
461            | D::Binary
462            | D::FixedSizeBinary(_)
463            | D::LargeBinary
464            | D::Utf8
465            | D::LargeUtf8
466            | D::Decimal(_, _)
467            | D::Decimal32(_, _)
468            | D::Decimal64(_, _)
469            | D::Decimal256(_, _)
470            | D::BinaryView
471            | D::Utf8View
472            | D::Unknown => false,
473            D::List(field)
474            | D::FixedSizeList(field, _)
475            | D::Map(field, _)
476            | D::LargeList(field) => field.dtype().contains_dictionary(),
477            D::Struct(fields) => fields.iter().any(|f| f.dtype().contains_dictionary()),
478            D::Union(union) => union.fields.iter().any(|f| f.dtype().contains_dictionary()),
479            D::Dictionary(_, _, _) => true,
480            D::Extension(ext) => ext.inner.contains_dictionary(),
481        }
482    }
483}
484
485impl From<IntegerType> for ArrowDataType {
486    fn from(item: IntegerType) -> Self {
487        match item {
488            IntegerType::Int8 => ArrowDataType::Int8,
489            IntegerType::Int16 => ArrowDataType::Int16,
490            IntegerType::Int32 => ArrowDataType::Int32,
491            IntegerType::Int64 => ArrowDataType::Int64,
492            IntegerType::Int128 => ArrowDataType::Int128,
493            IntegerType::UInt8 => ArrowDataType::UInt8,
494            IntegerType::UInt16 => ArrowDataType::UInt16,
495            IntegerType::UInt32 => ArrowDataType::UInt32,
496            IntegerType::UInt64 => ArrowDataType::UInt64,
497        }
498    }
499}
500
501impl From<PrimitiveType> for ArrowDataType {
502    fn from(item: PrimitiveType) -> Self {
503        match item {
504            PrimitiveType::Int8 => ArrowDataType::Int8,
505            PrimitiveType::Int16 => ArrowDataType::Int16,
506            PrimitiveType::Int32 => ArrowDataType::Int32,
507            PrimitiveType::Int64 => ArrowDataType::Int64,
508            PrimitiveType::UInt8 => ArrowDataType::UInt8,
509            PrimitiveType::UInt16 => ArrowDataType::UInt16,
510            PrimitiveType::UInt32 => ArrowDataType::UInt32,
511            PrimitiveType::UInt64 => ArrowDataType::UInt64,
512            PrimitiveType::Int128 => ArrowDataType::Int128,
513            PrimitiveType::Int256 => ArrowDataType::Decimal256(32, 32),
514            PrimitiveType::Float16 => ArrowDataType::Float16,
515            PrimitiveType::Float32 => ArrowDataType::Float32,
516            PrimitiveType::Float64 => ArrowDataType::Float64,
517            PrimitiveType::DaysMs => ArrowDataType::Interval(IntervalUnit::DayTime),
518            PrimitiveType::MonthDayNano => ArrowDataType::Interval(IntervalUnit::MonthDayNano),
519            PrimitiveType::UInt128 => unimplemented!(),
520        }
521    }
522}
523
/// typedef for [`Arc<ArrowSchema>`], i.e. a reference-counted, shareable [`ArrowSchema`].
// NOTE(review): `ArrowSchemaRef` is also re-exported from `schema` above; presumably the
// same shape — confirm whether both aliases are needed.
pub type SchemaRef = Arc<ArrowSchema>;
526
527/// support get extension for metadata
528pub fn get_extension(metadata: &Metadata) -> Extension {
529    if let Some(name) = metadata.get(&PlSmallStr::from_static("ARROW:extension:name")) {
530        let metadata = metadata
531            .get(&PlSmallStr::from_static("ARROW:extension:metadata"))
532            .cloned();
533        Some((name.clone(), metadata))
534    } else {
535        None
536    }
537}
538
/// Array type selected by the `bigidx` feature: `UInt32Array` when the feature is
/// disabled. Mirrors the choice made by [`ArrowDataType::IDX_DTYPE`].
#[cfg(not(feature = "bigidx"))]
pub type IdxArr = super::array::UInt32Array;
/// Array type selected by the `bigidx` feature: `UInt64Array` when the feature is
/// enabled. Mirrors the choice made by [`ArrowDataType::IDX_DTYPE`].
#[cfg(feature = "bigidx")]
pub type IdxArr = super::array::UInt64Array;