polars_arrow/datatypes/
mod.rs

1//! Contains all metadata, such as [`PhysicalType`], [`ArrowDataType`], [`Field`] and [`ArrowSchema`].
2
3mod field;
4mod physical_type;
5pub mod reshape;
6mod schema;
7
8use std::collections::BTreeMap;
9use std::sync::Arc;
10
11pub use field::{DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES, Field};
12pub use physical_type::*;
13use polars_utils::pl_str::PlSmallStr;
14pub use schema::{ArrowSchema, ArrowSchemaRef};
15#[cfg(feature = "serde")]
16use serde::{Deserialize, Serialize};
17
18/// Key-value metadata attached to a [`Field`] or an [`ArrowSchema`].
///
/// This is a typedef for `BTreeMap<PlSmallStr, PlSmallStr>`.
19pub type Metadata = BTreeMap<PlSmallStr, PlSmallStr>;
20/// typedef for `Option<(PlSmallStr, Option<PlSmallStr>)>` describing an optional extension:
/// the extension's name paired with its optional metadata string.
///
/// This is the value returned by [`get_extension`].
21pub(crate) type Extension = Option<(PlSmallStr, Option<PlSmallStr>)>;
22
23/// The set of supported logical types in this crate.
24///
25/// Each variant uniquely identifies a logical type, which defines specific semantics to the data
26/// (e.g. how it should be represented).
27/// Each variant has a corresponding [`PhysicalType`], obtained via [`ArrowDataType::to_physical_type`],
28/// which declares the in-memory representation of data.
29/// The [`ArrowDataType::Extension`] is special in that it augments an [`ArrowDataType`] with metadata to support custom types.
30/// Use [`ArrowDataType::to_logical_type`] to desugar such a type and return its corresponding logical type.
31#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
32#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
33pub enum ArrowDataType {
34    /// Null type. This is the [`Default`] variant.
35    #[default]
36    Null,
37    /// `true` and `false`.
38    Boolean,
39    /// An [`i8`]
40    Int8,
41    /// An [`i16`]
42    Int16,
43    /// An [`i32`]
44    Int32,
45    /// An [`i64`]
46    Int64,
47    /// An [`i128`]
48    Int128,
49    /// A [`u8`]
50    UInt8,
51    /// A [`u16`]
52    UInt16,
53    /// A [`u32`]
54    UInt32,
55    /// A [`u64`]
56    UInt64,
57    /// A 16-bit float
58    Float16,
59    /// An [`f32`]
60    Float32,
61    /// An [`f64`]
62    Float64,
63    /// An [`i64`] representing a timestamp measured in [`TimeUnit`] with an optional timezone.
64    ///
65    /// Time is measured from the Unix epoch, counting the number of [`TimeUnit`]s elapsed since
66    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
67    /// as a 64-bit signed integer.
68    ///
69    /// The time zone is a string indicating the name of a time zone, one of:
70    ///
71    /// * As used in the Olson time zone database (the "tz database" or
72    ///   "tzdata"), such as "America/New_York"
73    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
74    ///
75    /// When the timezone is not specified, the timestamp is considered to have no timezone
76    /// and is represented _as is_
77    Timestamp(TimeUnit, Option<PlSmallStr>),
78    /// An [`i32`] representing the elapsed time since UNIX epoch (1970-01-01)
79    /// in days.
80    Date32,
81    /// An [`i64`] representing the elapsed time since UNIX epoch (1970-01-01)
82    /// in milliseconds. Values are evenly divisible by 86400000.
83    Date64,
84    /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
85    /// Only [`TimeUnit::Second`] and [`TimeUnit::Millisecond`] are supported on this variant.
86    Time32(TimeUnit),
87    /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
88    /// Only [`TimeUnit::Microsecond`] and [`TimeUnit::Nanosecond`] are supported on this variant.
89    Time64(TimeUnit),
90    /// Measure of elapsed time. This elapsed time is a physical duration (i.e. 1s as defined in S.I.)
91    Duration(TimeUnit),
92    /// A "calendar" interval modeling elapsed time that takes into account calendar shifts.
93    /// For example an interval of 1 day may represent more than 24 hours.
94    Interval(IntervalUnit),
95    /// Opaque binary data of variable length whose offsets are represented as [`i32`].
96    Binary,
97    /// Opaque binary data of fixed size.
98    /// Enum parameter specifies the number of bytes per value.
99    FixedSizeBinary(usize),
100    /// Opaque binary data of variable length whose offsets are represented as [`i64`].
101    LargeBinary,
102    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i32`].
103    Utf8,
104    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i64`].
105    LargeUtf8,
106    /// A list of some logical data type whose offsets are represented as [`i32`].
107    List(Box<Field>),
108    /// A list of some logical data type with a fixed number of elements.
    /// The `usize` parameter is that number of elements.
109    FixedSizeList(Box<Field>, usize),
110    /// A list of some logical data type whose offsets are represented as [`i64`].
111    LargeList(Box<Field>),
112    /// A nested [`ArrowDataType`] with a given number of [`Field`]s.
113    Struct(Vec<Field>),
114    /// A nested type that is represented as
115    ///
116    /// `List<entries: Struct<key: K, value: V>>`
117    ///
118    /// In this layout, the keys and values are each respectively contiguous. We do
119    /// not constrain the key and value types, so the application is responsible
120    /// for ensuring that the keys are hashable and unique. Whether the keys are sorted
121    /// may be set in the metadata for this field.
122    ///
123    /// In a field with Map type, the field has a child Struct field, which then
124    /// has two children: the first the key type and the second the value type. The names of the
125    /// child fields may be respectively "entries", "key", and "value", but this is
126    /// not enforced.
127    ///
128    /// Map
129    /// ```text
130    ///   - child[0] entries: Struct
131    ///     - child[0] key: K
132    ///     - child[1] value: V
133    /// ```
134    /// Neither the "entries" field nor the "key" field may be nullable.
135    ///
136    /// The metadata is structured so that Arrow systems without special handling
137    /// for Map can make Map an alias for List. The "layout" attribute for the Map
138    /// field must have the same contents as a List.
    ///
    /// Variant parameters:
139    /// - [`Field`]: the child field described above
140    /// - `bool`: the "ordered" flag (presumably whether the keys are sorted — confirm)
141    Map(Box<Field>, bool),
142    /// A dictionary encoded array (`key_type`, `value_type`), where
143    /// each array element is an index of `key_type` into an
144    /// associated dictionary of `value_type`.
145    ///
146    /// Dictionary arrays are used to store columns of `value_type`
147    /// that contain many repeated values using less memory, but with
148    /// a higher CPU overhead for some operations.
149    ///
150    /// This type is mostly used to represent low cardinality string
151    /// arrays or a limited set of primitive types as integers.
152    ///
153    /// The `bool` value indicates the `Dictionary` is sorted if set to `true`.
154    Dictionary(IntegerType, Box<ArrowDataType>, bool),
155    /// Decimal value with precision and scale, backed by 128 bits.
156    /// Precision is the number of digits in the number and
157    /// scale is the number of decimal places.
158    /// The number 999.99 has a precision of 5 and scale of 2.
159    Decimal(usize, usize),
160    /// Decimal backed by 256 bits
161    Decimal256(usize, usize),
162    /// Extension type: a named wrapper around another [`ArrowDataType`], see [`ExtensionType`].
163    Extension(Box<ExtensionType>),
164    /// A binary type that inlines small values
165    /// and can intern bytes.
166    BinaryView,
167    /// A string type that inlines small values
168    /// and can intern strings.
169    Utf8View,
170    /// A type unknown to Arrow.
    ///
    /// [`ArrowDataType::to_physical_type`] is unimplemented for this variant.
171    Unknown,
172    /// A nested datatype that can represent slots of differing types.
173    /// The boxed [`UnionType`] holds the child fields, the optional ids and the [`UnionMode`].
    // `UnionType` has no serde derives, so this variant is skipped when (de)serializing.
174    #[cfg_attr(feature = "serde", serde(skip))]
175    Union(Box<UnionType>),
176}
177
/// The payload of [`ArrowDataType::Extension`]: a named wrapper around another data type.
178#[derive(Debug, Clone, PartialEq, Eq, Hash)]
179#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
180pub struct ExtensionType {
    /// Name of the extension type.
181    pub name: PlSmallStr,
    /// The wrapped data type. Physical-type and logical-type lookups on an extension
    /// defer to this type.
182    pub inner: ArrowDataType,
    /// Optional metadata string accompanying the extension.
183    pub metadata: Option<PlSmallStr>,
184}
185
/// The payload of [`ArrowDataType::Union`].
// NOTE: unlike its neighbours this type has no serde derives; the `Union` variant is
// marked `serde(skip)` accordingly.
186#[derive(Debug, Clone, PartialEq, Eq, Hash)]
187pub struct UnionType {
    /// The child fields of the union.
188    pub fields: Vec<Field>,
    /// Optional ids (presumably the type ids associated with `fields` — confirm).
189    pub ids: Option<Vec<i32>>,
    /// Whether the union is dense or sparse.
190    pub mode: UnionMode,
191}
192
/// Mode of [`ArrowDataType::Union`], stored in [`UnionType::mode`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnionMode {
    /// Dense union
    Dense,
    /// Sparse union
    Sparse,
}

impl UnionMode {
    /// Constructs a [`UnionMode::Sparse`] if `is_sparse` is `true`,
    /// or otherwise constructs a [`UnionMode::Dense`].
    ///
    /// This is a `const fn`, so it can be used to initialize constants and statics.
    pub const fn sparse(is_sparse: bool) -> Self {
        if is_sparse { Self::Sparse } else { Self::Dense }
    }

    /// Returns `true` if the mode is [`UnionMode::Sparse`].
    pub const fn is_sparse(&self) -> bool {
        // `matches!` rather than `==`: the derived `PartialEq` is not usable in `const fn`.
        matches!(self, Self::Sparse)
    }

    /// Returns `true` if the mode is [`UnionMode::Dense`].
    pub const fn is_dense(&self) -> bool {
        matches!(self, Self::Dense)
    }
}
220
221/// The time units defined in Arrow.
///
/// Parameterizes [`ArrowDataType::Timestamp`], [`ArrowDataType::Time32`],
/// [`ArrowDataType::Time64`] and [`ArrowDataType::Duration`].
222#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
223#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
224pub enum TimeUnit {
225    /// Time in seconds.
226    Second,
227    /// Time in milliseconds.
228    Millisecond,
229    /// Time in microseconds.
230    Microsecond,
231    /// Time in nanoseconds.
232    Nanosecond,
233}
234
235/// Interval units defined in Arrow.
///
/// Parameterizes [`ArrowDataType::Interval`]; the unit also selects the physical
/// representation returned by [`ArrowDataType::to_physical_type`].
236#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
237#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
238pub enum IntervalUnit {
239    /// The number of elapsed whole months.
    /// Physically a 32-bit integer primitive.
240    YearMonth,
241    /// The number of elapsed days and milliseconds,
242    /// stored as 2 contiguous `i32`
243    DayTime,
244    /// The number of elapsed months (i32), days (i32) and nanoseconds (i64).
245    MonthDayNano,
246}
247
248impl ArrowDataType {
249    /// Polars IdxSize type, dependent on bigidx feature
250    pub const IDX_DTYPE: Self = {
251        #[cfg(not(feature = "bigidx"))]
252        {
253            ArrowDataType::UInt32
254        }
255        #[cfg(feature = "bigidx")]
256        {
257            ArrowDataType::UInt64
258        }
259    };
260
261    /// the [`PhysicalType`] of this [`ArrowDataType`].
262    pub fn to_physical_type(&self) -> PhysicalType {
263        use ArrowDataType::*;
264        match self {
265            Null => PhysicalType::Null,
266            Boolean => PhysicalType::Boolean,
267            Int8 => PhysicalType::Primitive(PrimitiveType::Int8),
268            Int16 => PhysicalType::Primitive(PrimitiveType::Int16),
269            Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => {
270                PhysicalType::Primitive(PrimitiveType::Int32)
271            },
272            Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => {
273                PhysicalType::Primitive(PrimitiveType::Int64)
274            },
275            Decimal(_, _) => PhysicalType::Primitive(PrimitiveType::Int128),
276            Decimal256(_, _) => PhysicalType::Primitive(PrimitiveType::Int256),
277            UInt8 => PhysicalType::Primitive(PrimitiveType::UInt8),
278            UInt16 => PhysicalType::Primitive(PrimitiveType::UInt16),
279            UInt32 => PhysicalType::Primitive(PrimitiveType::UInt32),
280            UInt64 => PhysicalType::Primitive(PrimitiveType::UInt64),
281            Float16 => PhysicalType::Primitive(PrimitiveType::Float16),
282            Float32 => PhysicalType::Primitive(PrimitiveType::Float32),
283            Float64 => PhysicalType::Primitive(PrimitiveType::Float64),
284            Int128 => PhysicalType::Primitive(PrimitiveType::Int128),
285            Interval(IntervalUnit::DayTime) => PhysicalType::Primitive(PrimitiveType::DaysMs),
286            Interval(IntervalUnit::MonthDayNano) => {
287                PhysicalType::Primitive(PrimitiveType::MonthDayNano)
288            },
289            Binary => PhysicalType::Binary,
290            FixedSizeBinary(_) => PhysicalType::FixedSizeBinary,
291            LargeBinary => PhysicalType::LargeBinary,
292            Utf8 => PhysicalType::Utf8,
293            LargeUtf8 => PhysicalType::LargeUtf8,
294            BinaryView => PhysicalType::BinaryView,
295            Utf8View => PhysicalType::Utf8View,
296            List(_) => PhysicalType::List,
297            FixedSizeList(_, _) => PhysicalType::FixedSizeList,
298            LargeList(_) => PhysicalType::LargeList,
299            Struct(_) => PhysicalType::Struct,
300            Union(_) => PhysicalType::Union,
301            Map(_, _) => PhysicalType::Map,
302            Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
303            Extension(ext) => ext.inner.to_physical_type(),
304            Unknown => unimplemented!(),
305        }
306    }
307
308    // The datatype underlying this (possibly logical) arrow data type.
309    pub fn underlying_physical_type(&self) -> ArrowDataType {
310        use ArrowDataType::*;
311        match self {
312            Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
313            Date64
314            | Timestamp(_, _)
315            | Time64(_)
316            | Duration(_)
317            | Interval(IntervalUnit::DayTime) => Int64,
318            Interval(IntervalUnit::MonthDayNano) => unimplemented!(),
319            Binary => Binary,
320            List(field) => List(Box::new(Field {
321                dtype: field.dtype.underlying_physical_type(),
322                ..*field.clone()
323            })),
324            LargeList(field) => LargeList(Box::new(Field {
325                dtype: field.dtype.underlying_physical_type(),
326                ..*field.clone()
327            })),
328            FixedSizeList(field, width) => FixedSizeList(
329                Box::new(Field {
330                    dtype: field.dtype.underlying_physical_type(),
331                    ..*field.clone()
332                }),
333                *width,
334            ),
335            Struct(fields) => Struct(
336                fields
337                    .iter()
338                    .map(|field| Field {
339                        dtype: field.dtype.underlying_physical_type(),
340                        ..field.clone()
341                    })
342                    .collect(),
343            ),
344            Dictionary(keys, _, _) => (*keys).into(),
345            Union(_) => unimplemented!(),
346            Map(_, _) => unimplemented!(),
347            Extension(ext) => ext.inner.underlying_physical_type(),
348            _ => self.clone(),
349        }
350    }
351
352    /// Returns `&self` for all but [`ArrowDataType::Extension`]. For [`ArrowDataType::Extension`],
353    /// (recursively) returns the inner [`ArrowDataType`].
354    /// Never returns the variant [`ArrowDataType::Extension`].
355    pub fn to_logical_type(&self) -> &ArrowDataType {
356        use ArrowDataType::*;
357        match self {
358            Extension(ext) => ext.inner.to_logical_type(),
359            _ => self,
360        }
361    }
362
363    pub fn inner_dtype(&self) -> Option<&ArrowDataType> {
364        match self {
365            ArrowDataType::List(inner) => Some(inner.dtype()),
366            ArrowDataType::LargeList(inner) => Some(inner.dtype()),
367            ArrowDataType::FixedSizeList(inner, _) => Some(inner.dtype()),
368            _ => None,
369        }
370    }
371
372    pub fn is_nested(&self) -> bool {
373        use ArrowDataType as D;
374
375        matches!(
376            self,
377            D::List(_)
378                | D::LargeList(_)
379                | D::FixedSizeList(_, _)
380                | D::Struct(_)
381                | D::Union(_)
382                | D::Map(_, _)
383                | D::Dictionary(_, _, _)
384                | D::Extension(_)
385        )
386    }
387
388    pub fn is_view(&self) -> bool {
389        matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView)
390    }
391
392    pub fn is_numeric(&self) -> bool {
393        use ArrowDataType as D;
394        matches!(
395            self,
396            D::Int8
397                | D::Int16
398                | D::Int32
399                | D::Int64
400                | D::Int128
401                | D::UInt8
402                | D::UInt16
403                | D::UInt32
404                | D::UInt64
405                | D::Float32
406                | D::Float64
407                | D::Decimal(_, _)
408                | D::Decimal256(_, _)
409        )
410    }
411
412    pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType {
413        ArrowDataType::FixedSizeList(
414            Box::new(Field::new(
415                PlSmallStr::from_static("item"),
416                self,
417                is_nullable,
418            )),
419            size,
420        )
421    }
422
423    /// Check (recursively) whether datatype contains an [`ArrowDataType::Dictionary`] type.
424    pub fn contains_dictionary(&self) -> bool {
425        use ArrowDataType as D;
426        match self {
427            D::Null
428            | D::Boolean
429            | D::Int8
430            | D::Int16
431            | D::Int32
432            | D::Int64
433            | D::UInt8
434            | D::UInt16
435            | D::UInt32
436            | D::UInt64
437            | D::Int128
438            | D::Float16
439            | D::Float32
440            | D::Float64
441            | D::Timestamp(_, _)
442            | D::Date32
443            | D::Date64
444            | D::Time32(_)
445            | D::Time64(_)
446            | D::Duration(_)
447            | D::Interval(_)
448            | D::Binary
449            | D::FixedSizeBinary(_)
450            | D::LargeBinary
451            | D::Utf8
452            | D::LargeUtf8
453            | D::Decimal(_, _)
454            | D::Decimal256(_, _)
455            | D::BinaryView
456            | D::Utf8View
457            | D::Unknown => false,
458            D::List(field)
459            | D::FixedSizeList(field, _)
460            | D::Map(field, _)
461            | D::LargeList(field) => field.dtype().contains_dictionary(),
462            D::Struct(fields) => fields.iter().any(|f| f.dtype().contains_dictionary()),
463            D::Union(union) => union.fields.iter().any(|f| f.dtype().contains_dictionary()),
464            D::Dictionary(_, _, _) => true,
465            D::Extension(ext) => ext.inner.contains_dictionary(),
466        }
467    }
468}
469
470impl From<IntegerType> for ArrowDataType {
471    fn from(item: IntegerType) -> Self {
472        match item {
473            IntegerType::Int8 => ArrowDataType::Int8,
474            IntegerType::Int16 => ArrowDataType::Int16,
475            IntegerType::Int32 => ArrowDataType::Int32,
476            IntegerType::Int64 => ArrowDataType::Int64,
477            IntegerType::Int128 => ArrowDataType::Int128,
478            IntegerType::UInt8 => ArrowDataType::UInt8,
479            IntegerType::UInt16 => ArrowDataType::UInt16,
480            IntegerType::UInt32 => ArrowDataType::UInt32,
481            IntegerType::UInt64 => ArrowDataType::UInt64,
482        }
483    }
484}
485
486impl From<PrimitiveType> for ArrowDataType {
487    fn from(item: PrimitiveType) -> Self {
488        match item {
489            PrimitiveType::Int8 => ArrowDataType::Int8,
490            PrimitiveType::Int16 => ArrowDataType::Int16,
491            PrimitiveType::Int32 => ArrowDataType::Int32,
492            PrimitiveType::Int64 => ArrowDataType::Int64,
493            PrimitiveType::UInt8 => ArrowDataType::UInt8,
494            PrimitiveType::UInt16 => ArrowDataType::UInt16,
495            PrimitiveType::UInt32 => ArrowDataType::UInt32,
496            PrimitiveType::UInt64 => ArrowDataType::UInt64,
497            PrimitiveType::Int128 => ArrowDataType::Int128,
498            PrimitiveType::Int256 => ArrowDataType::Decimal256(32, 32),
499            PrimitiveType::Float16 => ArrowDataType::Float16,
500            PrimitiveType::Float32 => ArrowDataType::Float32,
501            PrimitiveType::Float64 => ArrowDataType::Float64,
502            PrimitiveType::DaysMs => ArrowDataType::Interval(IntervalUnit::DayTime),
503            PrimitiveType::MonthDayNano => ArrowDataType::Interval(IntervalUnit::MonthDayNano),
504            PrimitiveType::UInt128 => unimplemented!(),
505        }
506    }
507}
508
509/// Typedef for `Arc<ArrowSchema>`: a reference-counted handle to an [`ArrowSchema`].
// NOTE(review): `schema::ArrowSchemaRef` is also re-exported from this module — confirm
// whether both aliases are needed.
510pub type SchemaRef = Arc<ArrowSchema>;
511
512/// support get extension for metadata
513pub fn get_extension(metadata: &Metadata) -> Extension {
514    if let Some(name) = metadata.get(&PlSmallStr::from_static("ARROW:extension:name")) {
515        let metadata = metadata
516            .get(&PlSmallStr::from_static("ARROW:extension:metadata"))
517            .cloned();
518        Some((name.clone(), metadata))
519    } else {
520        None
521    }
522}
523
/// Array type for index values: `UInt32Array` because the `bigidx` feature is disabled.
/// Its element type matches [`ArrowDataType::IDX_DTYPE`].
524#[cfg(not(feature = "bigidx"))]
525pub type IdxArr = super::array::UInt32Array;
/// Array type for index values: `UInt64Array` because the `bigidx` feature is enabled.
/// Its element type matches [`ArrowDataType::IDX_DTYPE`].
526#[cfg(feature = "bigidx")]
527pub type IdxArr = super::array::UInt64Array;