polars_arrow/datatypes/
mod.rs

1//! Contains all metadata, such as [`PhysicalType`], [`ArrowDataType`], [`Field`] and [`ArrowSchema`].
2
3mod field;
4mod physical_type;
5pub mod reshape;
6mod schema;
7
8use std::collections::BTreeMap;
9use std::sync::Arc;
10
11pub use field::{
12    DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY,
13    DTYPE_ENUM_VALUES_NEW, Field, MAINTAIN_PL_TYPE, PARQUET_EMPTY_STRUCT, PL_KEY,
14};
15pub use physical_type::*;
16use polars_utils::pl_str::PlSmallStr;
17pub use schema::{ArrowSchema, ArrowSchemaRef};
18#[cfg(feature = "serde")]
19use serde::{Deserialize, Serialize};
20
21use crate::array::LIST_VALUES_NAME;
22
/// Key-value metadata attached to a [`Field`] or an [`ArrowSchema`]:
/// a `BTreeMap<PlSmallStr, PlSmallStr>`.
pub type Metadata = BTreeMap<PlSmallStr, PlSmallStr>;
/// typedef for `Option<(PlSmallStr, Option<PlSmallStr>)>` describing an optional extension
/// declaration as `(name, optional metadata)`, as returned by [`get_extension`].
pub(crate) type Extension = Option<(PlSmallStr, Option<PlSmallStr>)>;
27
/// The set of supported logical types in this crate.
///
/// Each variant uniquely identifies a logical type, which defines specific semantics to the data
/// (e.g. how it should be represented).
/// Each variant has a corresponding [`PhysicalType`], obtained via [`ArrowDataType::to_physical_type`],
/// which declares the in-memory representation of data.
/// The [`ArrowDataType::Extension`] is special in that it augments an [`ArrowDataType`] with metadata to support custom types.
/// Use [`ArrowDataType::to_logical_type`] to desugar such type and return its corresponding logical type.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum ArrowDataType {
    /// Null type. This is the [`Default`] variant.
    #[default]
    Null,
    /// `true` and `false`.
    Boolean,
    /// An [`i8`]
    Int8,
    /// An [`i16`]
    Int16,
    /// An [`i32`]
    Int32,
    /// An [`i64`]
    Int64,
    /// An [`i128`]
    Int128,
    /// A [`u8`]
    UInt8,
    /// A [`u16`]
    UInt16,
    /// A [`u32`]
    UInt32,
    /// A [`u64`]
    UInt64,
    /// A [`u128`]
    UInt128,
    /// A 16-bit float
    Float16,
    /// A [`f32`]
    Float32,
    /// A [`f64`]
    Float64,
    /// A [`i64`] representing a timestamp measured in [`TimeUnit`] with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a 64-bit signed integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// When the timezone is not specified, the timestamp is considered to have no timezone
    /// and is represented _as is_
    Timestamp(TimeUnit, Option<PlSmallStr>),
    /// An [`i32`] representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date32,
    /// An [`i64`] representing the elapsed time since UNIX epoch (1970-01-01)
    /// in milliseconds. Values are evenly divisible by 86400000.
    Date64,
    /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Only [`TimeUnit::Second`] and [`TimeUnit::Millisecond`] are supported on this variant.
    Time32(TimeUnit),
    /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Only [`TimeUnit::Microsecond`] and [`TimeUnit::Nanosecond`] are supported on this variant.
    Time64(TimeUnit),
    /// Measure of elapsed time. This elapsed time is a physical duration (i.e. 1s as defined in S.I.)
    Duration(TimeUnit),
    /// A "calendar" interval modeling elapsed time that takes into account calendar shifts.
    /// For example an interval of 1 day may represent more than 24 hours.
    Interval(IntervalUnit),
    /// Opaque binary data of variable length whose offsets are represented as [`i32`].
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(usize),
    /// Opaque binary data of variable length whose offsets are represented as [`i64`].
    LargeBinary,
    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i32`].
    Utf8,
    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i64`].
    LargeUtf8,
    /// A list of some logical data type whose offsets are represented as [`i32`].
    List(Box<Field>),
    /// A list of some logical data type with a fixed number of elements.
    FixedSizeList(Box<Field>, usize),
    /// A list of some logical data type whose offsets are represented as [`i64`].
    LargeList(Box<Field>),
    /// A nested [`ArrowDataType`] with a given number of [`Field`]s.
    Struct(Vec<Field>),
    /// A nested type that is represented as
    ///
    /// `List<entries: Struct<key: K, value: V>>`
    ///
    /// In this layout, the keys and values are each respectively contiguous. We do
    /// not constrain the key and value types, so the application is responsible
    /// for ensuring that the keys are hashable and unique. Whether the keys are sorted
    /// may be set in the metadata for this field.
    ///
    /// In a field with Map type, the field has a child Struct field, which then
    /// has two children: the first the key type and the second the value type. The names of the
    /// child fields may be respectively "entries", "key", and "value", but this is
    /// not enforced.
    ///
    /// Map
    /// ```text
    ///   - child[0] entries: Struct
    ///     - child[0] key: K
    ///     - child[1] value: V
    /// ```
    /// Neither the "entries" field nor the "key" field may be nullable.
    ///
    /// The metadata is structured so that Arrow systems without special handling
    /// for Map can make Map an alias for List. The "layout" attribute for the Map
    /// field must have the same contents as a List.
    ///
    /// The variant's arguments are, in order:
    /// - Field
    /// - ordered
    Map(Box<Field>, bool),
    /// A dictionary encoded array (`key_type`, `value_type`), where
    /// each array element is an index of `key_type` into an
    /// associated dictionary of `value_type`.
    ///
    /// Dictionary arrays are used to store columns of `value_type`
    /// that contain many repeated values using less memory, but with
    /// a higher CPU overhead for some operations.
    ///
    /// This type is mostly used to represent low cardinality string
    /// arrays or a limited set of primitive types as integers.
    ///
    /// The `bool` value indicates the `Dictionary` is sorted if set to `true`.
    Dictionary(IntegerType, Box<ArrowDataType>, bool),
    /// Decimal value with precision and scale, backed by 128 bits.
    /// Precision is the number of digits in the number and
    /// scale is the number of decimal places.
    /// The number 999.99 has a precision of 5 and scale of 2.
    Decimal(usize, usize),
    /// Decimal backed by 32 bits
    Decimal32(usize, usize),
    /// Decimal backed by 64 bits
    Decimal64(usize, usize),
    /// Decimal backed by 256 bits
    Decimal256(usize, usize),
    /// Extension type: a named wrapper around another [`ArrowDataType`], see [`ExtensionType`].
    Extension(Box<ExtensionType>),
    /// A binary type that inlines small values
    /// and can intern bytes.
    BinaryView,
    /// A string type that inlines small values
    /// and can intern strings.
    Utf8View,
    /// A type unknown to Arrow.
    ///
    /// [`ArrowDataType::to_physical_type`] is not implemented for this variant.
    Unknown,
    /// A nested datatype that can represent slots of differing types.
    /// The fields, optional ids and [`UnionMode`] are carried by the boxed [`UnionType`].
    ///
    /// This variant is marked `serde(skip)` when the `serde` or `dsl-schema` feature is enabled.
    #[cfg_attr(any(feature = "serde", feature = "dsl-schema"), serde(skip))]
    Union(Box<UnionType>),
}
189
/// Payload of [`ArrowDataType::Extension`]: a named extension wrapping another data type.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct ExtensionType {
    /// Name of the extension.
    pub name: PlSmallStr,
    /// The wrapped data type. [`ArrowDataType::to_logical_type`] and
    /// [`ArrowDataType::to_physical_type`] resolve an extension through this field.
    pub inner: ArrowDataType,
    /// Optional metadata of the extension.
    pub metadata: Option<PlSmallStr>,
}
198
/// Payload of [`ArrowDataType::Union`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct UnionType {
    /// Child fields of the union.
    pub fields: Vec<Field>,
    /// Optional ids. NOTE(review): presumably the type ids associated with `fields`;
    /// confirm against the union array implementation.
    pub ids: Option<Vec<i32>>,
    /// Whether the union is dense or sparse.
    pub mode: UnionMode,
}
205
/// Mode of [`ArrowDataType::Union`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnionMode {
    /// Dense union
    Dense,
    /// Sparse union
    Sparse,
}

impl UnionMode {
    /// Maps a flag to a mode: `true` yields [`UnionMode::Sparse`],
    /// `false` yields [`UnionMode::Dense`].
    pub fn sparse(is_sparse: bool) -> Self {
        match is_sparse {
            true => Self::Sparse,
            false => Self::Dense,
        }
    }

    /// Returns `true` only for [`UnionMode::Sparse`].
    pub fn is_sparse(&self) -> bool {
        match self {
            Self::Sparse => true,
            Self::Dense => false,
        }
    }

    /// Returns `true` only for [`UnionMode::Dense`].
    pub fn is_dense(&self) -> bool {
        match self {
            Self::Dense => true,
            Self::Sparse => false,
        }
    }
}
233
/// The time units defined in Arrow.
///
/// Used as the resolution of [`ArrowDataType::Timestamp`], [`ArrowDataType::Time32`],
/// [`ArrowDataType::Time64`] and [`ArrowDataType::Duration`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum TimeUnit {
    /// Time in seconds.
    Second,
    /// Time in milliseconds.
    Millisecond,
    /// Time in microseconds.
    Microsecond,
    /// Time in nanoseconds.
    Nanosecond,
}
248
/// Interval units defined in Arrow.
///
/// Used as the parameter of [`ArrowDataType::Interval`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum IntervalUnit {
    /// The number of elapsed whole months.
    YearMonth,
    /// The number of elapsed days and milliseconds,
    /// stored as 2 contiguous `i32`.
    DayTime,
    /// The number of elapsed months (i32), days (i32) and nanoseconds (i64).
    MonthDayNano,
    /// `(months: i32, days: i32, milliseconds: i32)`.
    /// Used when loading the Parquet INTERVAL type. This is expected to be
    /// unreachable outside of Parquet reading.
    MonthDayMillis,
}
266
267impl ArrowDataType {
268    /// Polars IdxSize type, dependent on bigidx feature
269    pub const IDX_DTYPE: Self = {
270        #[cfg(not(feature = "bigidx"))]
271        {
272            ArrowDataType::UInt32
273        }
274        #[cfg(feature = "bigidx")]
275        {
276            ArrowDataType::UInt64
277        }
278    };
279
280    /// the [`PhysicalType`] of this [`ArrowDataType`].
281    pub fn to_physical_type(&self) -> PhysicalType {
282        use ArrowDataType::*;
283        match self {
284            Null => PhysicalType::Null,
285            Boolean => PhysicalType::Boolean,
286            Int8 => PhysicalType::Primitive(PrimitiveType::Int8),
287            Int16 => PhysicalType::Primitive(PrimitiveType::Int16),
288            Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => {
289                PhysicalType::Primitive(PrimitiveType::Int32)
290            },
291            Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => {
292                PhysicalType::Primitive(PrimitiveType::Int64)
293            },
294            Int128 => PhysicalType::Primitive(PrimitiveType::Int128),
295            Decimal(_, _) => PhysicalType::Primitive(PrimitiveType::Int128),
296            Decimal32(_, _) => PhysicalType::Primitive(PrimitiveType::Int32),
297            Decimal64(_, _) => PhysicalType::Primitive(PrimitiveType::Int64),
298            Decimal256(_, _) => PhysicalType::Primitive(PrimitiveType::Int256),
299            UInt8 => PhysicalType::Primitive(PrimitiveType::UInt8),
300            UInt16 => PhysicalType::Primitive(PrimitiveType::UInt16),
301            UInt32 => PhysicalType::Primitive(PrimitiveType::UInt32),
302            UInt64 => PhysicalType::Primitive(PrimitiveType::UInt64),
303            UInt128 => PhysicalType::Primitive(PrimitiveType::UInt128),
304            Float16 => PhysicalType::Primitive(PrimitiveType::Float16),
305            Float32 => PhysicalType::Primitive(PrimitiveType::Float32),
306            Float64 => PhysicalType::Primitive(PrimitiveType::Float64),
307            Interval(IntervalUnit::DayTime) => PhysicalType::Primitive(PrimitiveType::DaysMs),
308            Interval(IntervalUnit::MonthDayNano) => {
309                PhysicalType::Primitive(PrimitiveType::MonthDayNano)
310            },
311            Interval(IntervalUnit::MonthDayMillis) => {
312                PhysicalType::Primitive(PrimitiveType::MonthDayMillis)
313            },
314            Binary => PhysicalType::Binary,
315            FixedSizeBinary(_) => PhysicalType::FixedSizeBinary,
316            LargeBinary => PhysicalType::LargeBinary,
317            Utf8 => PhysicalType::Utf8,
318            LargeUtf8 => PhysicalType::LargeUtf8,
319            BinaryView => PhysicalType::BinaryView,
320            Utf8View => PhysicalType::Utf8View,
321            List(_) => PhysicalType::List,
322            FixedSizeList(_, _) => PhysicalType::FixedSizeList,
323            LargeList(_) => PhysicalType::LargeList,
324            Struct(_) => PhysicalType::Struct,
325            Union(_) => PhysicalType::Union,
326            Map(_, _) => PhysicalType::Map,
327            Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
328            Extension(ext) => ext.inner.to_physical_type(),
329            Unknown => unimplemented!(),
330        }
331    }
332
333    // The datatype underlying this (possibly logical) arrow data type.
334    pub fn underlying_physical_type(&self) -> ArrowDataType {
335        use ArrowDataType::*;
336        match self {
337            Decimal32(_, _) | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
338            Decimal64(_, _)
339            | Date64
340            | Timestamp(_, _)
341            | Time64(_)
342            | Duration(_)
343            | Interval(IntervalUnit::DayTime) => Int64,
344            Interval(IntervalUnit::MonthDayNano) => unimplemented!(),
345            Binary => Binary,
346            Decimal(_, _) => Int128,
347            Decimal256(_, _) => unimplemented!(),
348            List(field) => List(Box::new(Field {
349                dtype: field.dtype.underlying_physical_type(),
350                ..*field.clone()
351            })),
352            LargeList(field) => LargeList(Box::new(Field {
353                dtype: field.dtype.underlying_physical_type(),
354                ..*field.clone()
355            })),
356            FixedSizeList(field, width) => FixedSizeList(
357                Box::new(Field {
358                    dtype: field.dtype.underlying_physical_type(),
359                    ..*field.clone()
360                }),
361                *width,
362            ),
363            Struct(fields) => Struct(
364                fields
365                    .iter()
366                    .map(|field| Field {
367                        dtype: field.dtype.underlying_physical_type(),
368                        ..field.clone()
369                    })
370                    .collect(),
371            ),
372            Dictionary(keys, _, _) => (*keys).into(),
373            Union(_) => unimplemented!(),
374            Map(_, _) => unimplemented!(),
375            Extension(ext) => ext.inner.underlying_physical_type(),
376            _ => self.clone(),
377        }
378    }
379
380    /// Returns `&self` for all but [`ArrowDataType::Extension`]. For [`ArrowDataType::Extension`],
381    /// (recursively) returns the inner [`ArrowDataType`].
382    /// Never returns the variant [`ArrowDataType::Extension`].
383    pub fn to_logical_type(&self) -> &ArrowDataType {
384        use ArrowDataType::*;
385        match self {
386            Extension(ext) => ext.inner.to_logical_type(),
387            _ => self,
388        }
389    }
390
391    pub fn inner_dtype(&self) -> Option<&ArrowDataType> {
392        match self {
393            ArrowDataType::List(inner) => Some(inner.dtype()),
394            ArrowDataType::LargeList(inner) => Some(inner.dtype()),
395            ArrowDataType::FixedSizeList(inner, _) => Some(inner.dtype()),
396            _ => None,
397        }
398    }
399
400    pub fn is_nested(&self) -> bool {
401        use ArrowDataType as D;
402
403        matches!(
404            self,
405            D::List(_)
406                | D::LargeList(_)
407                | D::FixedSizeList(_, _)
408                | D::Struct(_)
409                | D::Union(_)
410                | D::Map(_, _)
411                | D::Dictionary(_, _, _)
412                | D::Extension(_)
413        )
414    }
415
416    pub fn is_view(&self) -> bool {
417        matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView)
418    }
419
420    pub fn is_numeric(&self) -> bool {
421        use ArrowDataType as D;
422        matches!(
423            self,
424            D::Int8
425                | D::Int16
426                | D::Int32
427                | D::Int64
428                | D::Int128
429                | D::UInt8
430                | D::UInt16
431                | D::UInt32
432                | D::UInt64
433                | D::UInt128
434                | D::Float32
435                | D::Float64
436                | D::Decimal(_, _)
437                | D::Decimal32(_, _)
438                | D::Decimal64(_, _)
439                | D::Decimal256(_, _)
440        )
441    }
442
443    pub fn to_large_list(self, is_nullable: bool) -> ArrowDataType {
444        ArrowDataType::LargeList(Box::new(Field::new(LIST_VALUES_NAME, self, is_nullable)))
445    }
446
447    pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType {
448        ArrowDataType::FixedSizeList(
449            Box::new(Field::new(LIST_VALUES_NAME, self, is_nullable)),
450            size,
451        )
452    }
453
454    /// Check (recursively) whether datatype contains an [`ArrowDataType::Dictionary`] type.
455    pub fn contains_dictionary(&self) -> bool {
456        use ArrowDataType as D;
457        match self {
458            D::Null
459            | D::Boolean
460            | D::Int8
461            | D::Int16
462            | D::Int32
463            | D::Int64
464            | D::Int128
465            | D::UInt8
466            | D::UInt16
467            | D::UInt32
468            | D::UInt64
469            | D::UInt128
470            | D::Float16
471            | D::Float32
472            | D::Float64
473            | D::Timestamp(_, _)
474            | D::Date32
475            | D::Date64
476            | D::Time32(_)
477            | D::Time64(_)
478            | D::Duration(_)
479            | D::Interval(_)
480            | D::Binary
481            | D::FixedSizeBinary(_)
482            | D::LargeBinary
483            | D::Utf8
484            | D::LargeUtf8
485            | D::Decimal(_, _)
486            | D::Decimal32(_, _)
487            | D::Decimal64(_, _)
488            | D::Decimal256(_, _)
489            | D::BinaryView
490            | D::Utf8View
491            | D::Unknown => false,
492            D::List(field)
493            | D::FixedSizeList(field, _)
494            | D::Map(field, _)
495            | D::LargeList(field) => field.dtype().contains_dictionary(),
496            D::Struct(fields) => fields.iter().any(|f| f.dtype().contains_dictionary()),
497            D::Union(union) => union.fields.iter().any(|f| f.dtype().contains_dictionary()),
498            D::Dictionary(_, _, _) => true,
499            D::Extension(ext) => ext.inner.contains_dictionary(),
500        }
501    }
502}
503
504impl From<IntegerType> for ArrowDataType {
505    fn from(item: IntegerType) -> Self {
506        match item {
507            IntegerType::Int8 => ArrowDataType::Int8,
508            IntegerType::Int16 => ArrowDataType::Int16,
509            IntegerType::Int32 => ArrowDataType::Int32,
510            IntegerType::Int64 => ArrowDataType::Int64,
511            IntegerType::Int128 => ArrowDataType::Int128,
512            IntegerType::UInt8 => ArrowDataType::UInt8,
513            IntegerType::UInt16 => ArrowDataType::UInt16,
514            IntegerType::UInt32 => ArrowDataType::UInt32,
515            IntegerType::UInt64 => ArrowDataType::UInt64,
516            IntegerType::UInt128 => ArrowDataType::UInt128,
517        }
518    }
519}
520
521impl From<PrimitiveType> for ArrowDataType {
522    fn from(item: PrimitiveType) -> Self {
523        match item {
524            PrimitiveType::Int8 => ArrowDataType::Int8,
525            PrimitiveType::Int16 => ArrowDataType::Int16,
526            PrimitiveType::Int32 => ArrowDataType::Int32,
527            PrimitiveType::Int64 => ArrowDataType::Int64,
528            PrimitiveType::Int128 => ArrowDataType::Int128,
529            PrimitiveType::UInt8 => ArrowDataType::UInt8,
530            PrimitiveType::UInt16 => ArrowDataType::UInt16,
531            PrimitiveType::UInt32 => ArrowDataType::UInt32,
532            PrimitiveType::UInt64 => ArrowDataType::UInt64,
533            PrimitiveType::UInt128 => ArrowDataType::UInt128,
534            PrimitiveType::Int256 => ArrowDataType::Decimal256(32, 32),
535            PrimitiveType::Float16 => ArrowDataType::Float16,
536            PrimitiveType::Float32 => ArrowDataType::Float32,
537            PrimitiveType::Float64 => ArrowDataType::Float64,
538            PrimitiveType::DaysMs => ArrowDataType::Interval(IntervalUnit::DayTime),
539            PrimitiveType::MonthDayNano => ArrowDataType::Interval(IntervalUnit::MonthDayNano),
540            PrimitiveType::MonthDayMillis => ArrowDataType::Interval(IntervalUnit::MonthDayMillis),
541        }
542    }
543}
544
/// typedef for `Arc<ArrowSchema>`: a shared, reference-counted [`ArrowSchema`].
pub type SchemaRef = Arc<ArrowSchema>;
547
548/// support get extension for metadata
549pub fn get_extension(metadata: &Metadata) -> Extension {
550    if let Some(name) = metadata.get(&PlSmallStr::from_static("ARROW:extension:name")) {
551        let metadata = metadata
552            .get(&PlSmallStr::from_static("ARROW:extension:metadata"))
553            .cloned();
554        Some((name.clone(), metadata))
555    } else {
556        None
557    }
558}
559
/// Index array type used when the `bigidx` feature is disabled: a `UInt32Array`.
#[cfg(not(feature = "bigidx"))]
pub type IdxArr = super::array::UInt32Array;
/// Index array type used when the `bigidx` feature is enabled: a `UInt64Array`.
#[cfg(feature = "bigidx")]
pub type IdxArr = super::array::UInt64Array;