Skip to main content

datafusion_common/types/
native.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use super::{
19    LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
20    TypeSignature,
21};
22use crate::error::{_internal_err, Result};
23use arrow::compute::can_cast_types;
24use arrow::datatypes::{
25    DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType,
26    Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
27};
28use std::{fmt::Display, sync::Arc};
29
30/// Representation of a type that DataFusion can handle natively. It is a subset
31/// of the physical variants in Arrow's native [`DataType`].
32#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
33pub enum NativeType {
34    /// Null type
35    Null,
36    /// A boolean type representing the values `true` and `false`.
37    Boolean,
38    /// A signed 8-bit integer.
39    Int8,
40    /// A signed 16-bit integer.
41    Int16,
42    /// A signed 32-bit integer.
43    Int32,
44    /// A signed 64-bit integer.
45    Int64,
46    /// An unsigned 8-bit integer.
47    UInt8,
48    /// An unsigned 16-bit integer.
49    UInt16,
50    /// An unsigned 32-bit integer.
51    UInt32,
52    /// An unsigned 64-bit integer.
53    UInt64,
54    /// A 16-bit floating point number.
55    Float16,
56    /// A 32-bit floating point number.
57    Float32,
58    /// A 64-bit floating point number.
59    Float64,
60    /// A timestamp with an optional timezone.
61    ///
62    /// Time is measured as a Unix epoch, counting the seconds from
63    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
64    /// as a signed 64-bit integer.
65    ///
66    /// The time zone is a string indicating the name of a time zone, one of:
67    ///
68    /// * As used in the Olson time zone database (the "tz database" or
69    ///   "tzdata"), such as "America/New_York"
70    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
71    ///
72    /// Timestamps with a non-empty timezone
73    /// ------------------------------------
74    ///
75    /// If a Timestamp column has a non-empty timezone value, its epoch is
76    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
77    /// (the Unix epoch), regardless of the Timestamp's own timezone.
78    ///
79    /// Therefore, timestamp values with a non-empty timezone correspond to
80    /// physical points in time together with some additional information about
81    /// how the data was obtained and/or how to display it (the timezone).
82    ///
83    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
84    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
85    ///   application may prefer to display it as "January 1st 1970, 01h00" in
86    ///   the Europe/Paris timezone (which is the same physical point in time).
87    ///
88    /// One consequence is that timestamp values with a non-empty timezone
89    /// can be compared and ordered directly, since they all share the same
90    /// well-known point of reference (the Unix epoch).
91    ///
92    /// Timestamps with an unset / empty timezone
93    /// -----------------------------------------
94    ///
95    /// If a Timestamp column has no timezone value, its epoch is
96    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
97    ///
98    /// Therefore, timestamp values without a timezone cannot be meaningfully
99    /// interpreted as physical points in time, but only as calendar / clock
100    /// indications ("wall clock time") in an unspecified timezone.
101    ///
102    ///   For example, the timestamp value 0 with an empty timezone string
103    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
104    ///   is not enough information to interpret it as a well-defined physical
105    ///   point in time.
106    ///
107    /// One consequence is that timestamp values without a timezone cannot
108    /// be reliably compared or ordered, since they may have different points of
109    /// reference.  In particular, it is *not* possible to interpret an unset
110    /// or empty timezone as the same as "UTC".
111    ///
112    /// Conversion between timezones
113    /// ----------------------------
114    ///
115    /// If a Timestamp column has a non-empty timezone, changing the timezone
116    /// to a different non-empty value is a metadata-only operation:
117    /// the timestamp values need not change as their point of reference remains
118    /// the same (the Unix epoch).
119    ///
120    /// However, if a Timestamp column has no timezone value, changing it to a
121    /// non-empty value requires to think about the desired semantics.
122    /// One possibility is to assume that the original timestamp values are
123    /// relative to the epoch of the timezone being set; timestamp values should
124    /// then adjusted to the Unix epoch (for example, changing the timezone from
125    /// empty to "Europe/Paris" would require converting the timestamp values
126    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
127    /// nevertheless correct).
128    ///
129    /// ```
130    /// # use arrow::datatypes::{DataType, TimeUnit};
131    /// DataType::Timestamp(TimeUnit::Second, None);
132    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
133    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
134    /// ```
135    Timestamp(TimeUnit, Option<Arc<str>>),
136    /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
137    /// in days.
138    Date,
139    /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
140    Time(TimeUnit),
141    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
142    Duration(TimeUnit),
143    /// A "calendar" interval which models types that don't necessarily
144    /// have a precise duration without the context of a base timestamp (e.g.
145    /// days can differ in length during day light savings time transitions).
146    Interval(IntervalUnit),
147    /// Opaque binary data of variable length.
148    Binary,
149    /// Opaque binary data of fixed size.
150    /// Enum parameter specifies the number of bytes per value.
151    FixedSizeBinary(i32),
152    /// A variable-length string in Unicode with UTF-8 encoding.
153    String,
154    /// A list of some logical data type with variable length.
155    List(LogicalFieldRef),
156    /// A list of some logical data type with fixed length.
157    FixedSizeList(LogicalFieldRef, i32),
158    /// A nested type that contains a number of sub-fields.
159    Struct(LogicalFields),
160    /// A nested type that can represent slots of differing types.
161    Union(LogicalUnionFields),
162    /// Decimal value with precision and scale
163    ///
164    /// * precision is the total number of digits
165    /// * scale is the number of digits past the decimal
166    ///
167    /// For example the number 123.45 has precision 5 and scale 2.
168    ///
169    /// In certain situations, scale could be negative number. For
170    /// negative scale, it is the number of padding 0 to the right
171    /// of the digits.
172    ///
173    /// For example the number 12300 could be treated as a decimal
174    /// has precision 3 and scale -2.
175    Decimal(u8, i8),
176    /// A Map is a type that an association between a key and a value.
177    ///
178    /// The key and value types are not constrained, but keys should be
179    /// hashable and unique.
180    ///
181    /// In a field with Map type, key type and the second the value type. The names of the
182    /// child fields may be respectively "entries", "key", and "value", but this is
183    /// not enforced.
184    Map(LogicalFieldRef),
185}
186
187impl Display for NativeType {
188    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189        // Match the format used by arrow::datatypes::DataType's Display impl
190        match self {
191            Self::Null => write!(f, "Null"),
192            Self::Boolean => write!(f, "Boolean"),
193            Self::Int8 => write!(f, "Int8"),
194            Self::Int16 => write!(f, "Int16"),
195            Self::Int32 => write!(f, "Int32"),
196            Self::Int64 => write!(f, "Int64"),
197            Self::UInt8 => write!(f, "UInt8"),
198            Self::UInt16 => write!(f, "UInt16"),
199            Self::UInt32 => write!(f, "UInt32"),
200            Self::UInt64 => write!(f, "UInt64"),
201            Self::Float16 => write!(f, "Float16"),
202            Self::Float32 => write!(f, "Float32"),
203            Self::Float64 => write!(f, "Float64"),
204            Self::Timestamp(unit, Some(tz)) => write!(f, "Timestamp({unit}, {tz:?})"),
205            Self::Timestamp(unit, None) => write!(f, "Timestamp({unit})"),
206            Self::Date => write!(f, "Date"),
207            Self::Time(unit) => write!(f, "Time({unit})"),
208            Self::Duration(unit) => write!(f, "Duration({unit})"),
209            Self::Interval(unit) => write!(f, "Interval({unit:?})"),
210            Self::Binary => write!(f, "Binary"),
211            Self::FixedSizeBinary(size) => write!(f, "FixedSizeBinary({size})"),
212            Self::String => write!(f, "String"),
213            Self::List(field) => write!(f, "List({})", field.logical_type),
214            Self::FixedSizeList(field, size) => {
215                write!(f, "FixedSizeList({size} x {})", field.logical_type)
216            }
217            Self::Struct(fields) => {
218                write!(f, "Struct(")?;
219                for (i, field) in fields.iter().enumerate() {
220                    if i > 0 {
221                        write!(f, ", ")?;
222                    }
223                    write!(f, "{:?}: {}", field.name, field.logical_type)?;
224                }
225                write!(f, ")")
226            }
227            Self::Union(fields) => {
228                write!(f, "Union(")?;
229                for (i, (type_id, field)) in fields.iter().enumerate() {
230                    if i > 0 {
231                        write!(f, ", ")?;
232                    }
233                    write!(f, "{type_id}: ({:?}: {})", field.name, field.logical_type)?;
234                }
235                write!(f, ")")
236            }
237            Self::Decimal(precision, scale) => write!(f, "Decimal({precision}, {scale})"),
238            Self::Map(field) => write!(f, "Map({})", field.logical_type),
239        }
240    }
241}
242
243impl LogicalType for NativeType {
244    fn native(&self) -> &NativeType {
245        self
246    }
247
248    fn signature(&self) -> TypeSignature<'_> {
249        TypeSignature::Native(self)
250    }
251
252    /// Returns the default casted type for the given arrow type
253    ///
254    /// For types like String or Date, multiple arrow types mapped to the same logical type
255    /// If the given arrow type is one of them, we return the same type
256    /// Otherwise, we define the default casted type for the given arrow type
257    fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
258        use DataType::*;
259
260        fn default_field_cast(to: &LogicalField, from: &Field) -> Result<FieldRef> {
261            Ok(Arc::new(Field::new(
262                to.name.clone(),
263                to.logical_type.default_cast_for(from.data_type())?,
264                to.nullable,
265            )))
266        }
267
268        Ok(match (self, origin) {
269            (Self::Null, _) => Null,
270            (Self::Boolean, _) => Boolean,
271            (Self::Int8, _) => Int8,
272            (Self::Int16, _) => Int16,
273            (Self::Int32, _) => Int32,
274            (Self::Int64, _) => Int64,
275            (Self::UInt8, _) => UInt8,
276            (Self::UInt16, _) => UInt16,
277            (Self::UInt32, _) => UInt32,
278            (Self::UInt64, _) => UInt64,
279            (Self::Float16, _) => Float16,
280            (Self::Float32, _) => Float32,
281            (Self::Float64, _) => Float64,
282            (Self::Decimal(p, s), _) if *p <= DECIMAL32_MAX_PRECISION => {
283                Decimal32(*p, *s)
284            }
285            (Self::Decimal(p, s), _) if *p <= DECIMAL64_MAX_PRECISION => {
286                Decimal64(*p, *s)
287            }
288            (Self::Decimal(p, s), _) if *p <= DECIMAL128_MAX_PRECISION => {
289                Decimal128(*p, *s)
290            }
291            (Self::Decimal(p, s), _) => Decimal256(*p, *s),
292            (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
293            // If given type is Date, return the same type
294            (Self::Date, Date32 | Date64) => origin.to_owned(),
295            (Self::Date, _) => Date32,
296            (Self::Time(tu), _) => match tu {
297                TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
298                TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
299            },
300            (Self::Duration(tu), _) => Duration(*tu),
301            (Self::Interval(iu), _) => Interval(*iu),
302            (Self::Binary, LargeUtf8) => LargeBinary,
303            (Self::Binary, Utf8View) => BinaryView,
304            // We don't cast to another kind of binary type if the origin one is already a binary type
305            (Self::Binary, Binary | LargeBinary | BinaryView) => origin.to_owned(),
306            (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
307                BinaryView
308            }
309            (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => {
310                LargeBinary
311            }
312            (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary,
313            (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
314            (Self::String, LargeBinary) => LargeUtf8,
315            (Self::String, BinaryView) => Utf8View,
316            // We don't cast to another kind of string type if the origin one is already a string type
317            (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(),
318            (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View,
319            (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => {
320                LargeUtf8
321            }
322            (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8,
323            (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => {
324                List(default_field_cast(to_field, from_field)?)
325            }
326            (Self::List(to_field), LargeList(from_field)) => {
327                LargeList(default_field_cast(to_field, from_field)?)
328            }
329            (Self::List(to_field), ListView(from_field)) => {
330                ListView(default_field_cast(to_field, from_field)?)
331            }
332            (Self::List(to_field), LargeListView(from_field)) => {
333                LargeListView(default_field_cast(to_field, from_field)?)
334            }
335            // List array where each element is a len 1 list of the origin type
336            (Self::List(field), _) => List(Arc::new(Field::new(
337                field.name.clone(),
338                field.logical_type.default_cast_for(origin)?,
339                field.nullable,
340            ))),
341            (
342                Self::FixedSizeList(to_field, to_size),
343                FixedSizeList(from_field, from_size),
344            ) if from_size == to_size => {
345                FixedSizeList(default_field_cast(to_field, from_field)?, *to_size)
346            }
347            (
348                Self::FixedSizeList(to_field, size),
349                List(from_field)
350                | LargeList(from_field)
351                | ListView(from_field)
352                | LargeListView(from_field),
353            ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size),
354            // FixedSizeList array where each element is a len 1 list of the origin type
355            (Self::FixedSizeList(field, size), _) => FixedSizeList(
356                Arc::new(Field::new(
357                    field.name.clone(),
358                    field.logical_type.default_cast_for(origin)?,
359                    field.nullable,
360                )),
361                *size,
362            ),
363            // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
364            (Self::Struct(to_fields), Struct(from_fields))
365                if from_fields.len() == to_fields.len() =>
366            {
367                Struct(
368                    from_fields
369                        .iter()
370                        .zip(to_fields.iter())
371                        .map(|(from, to)| default_field_cast(to, from))
372                        .collect::<Result<Fields>>()?,
373                )
374            }
375            (Self::Struct(to_fields), Null) => Struct(
376                to_fields
377                    .iter()
378                    .map(|field| {
379                        Ok(Arc::new(Field::new(
380                            field.name.clone(),
381                            field.logical_type.default_cast_for(&Null)?,
382                            field.nullable,
383                        )))
384                    })
385                    .collect::<Result<Fields>>()?,
386            ),
387            (Self::Map(to_field), Map(from_field, sorted)) => {
388                Map(default_field_cast(to_field, from_field)?, *sorted)
389            }
390            (Self::Map(field), Null) => Map(
391                Arc::new(Field::new(
392                    field.name.clone(),
393                    field.logical_type.default_cast_for(&Null)?,
394                    field.nullable,
395                )),
396                false,
397            ),
398            (Self::Union(to_fields), Union(from_fields, mode))
399                if from_fields.len() == to_fields.len() =>
400            {
401                Union(
402                    from_fields
403                        .iter()
404                        .zip(to_fields.iter())
405                        .map(|((_, from), (i, to))| {
406                            Ok((*i, default_field_cast(to, from)?))
407                        })
408                        .collect::<Result<UnionFields>>()?,
409                    *mode,
410                )
411            }
412            _ => {
413                return _internal_err!(
414                    "Unavailable default cast for native type {} from physical type {}",
415                    self,
416                    origin
417                );
418            }
419        })
420    }
421}
422
// The following From<DataType>, From<Field>, ... implementations are temporary
// mapping solutions to provide backwards compatibility while transitioning from
// the purely physical system to a logical / physical system.
426
427impl From<&DataType> for NativeType {
428    fn from(value: &DataType) -> Self {
429        value.clone().into()
430    }
431}
432
433impl From<DataType> for NativeType {
434    fn from(value: DataType) -> Self {
435        use NativeType::*;
436        match value {
437            DataType::Null => Null,
438            DataType::Boolean => Boolean,
439            DataType::Int8 => Int8,
440            DataType::Int16 => Int16,
441            DataType::Int32 => Int32,
442            DataType::Int64 => Int64,
443            DataType::UInt8 => UInt8,
444            DataType::UInt16 => UInt16,
445            DataType::UInt32 => UInt32,
446            DataType::UInt64 => UInt64,
447            DataType::Float16 => Float16,
448            DataType::Float32 => Float32,
449            DataType::Float64 => Float64,
450            DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
451            DataType::Date32 | DataType::Date64 => Date,
452            DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
453            DataType::Duration(tu) => Duration(tu),
454            DataType::Interval(iu) => Interval(iu),
455            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
456            DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
457            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String,
458            DataType::List(field)
459            | DataType::ListView(field)
460            | DataType::LargeList(field)
461            | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
462            DataType::FixedSizeList(field, size) => {
463                FixedSizeList(Arc::new(field.as_ref().into()), size)
464            }
465            DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
466            DataType::Union(union_fields, _) => {
467                Union(LogicalUnionFields::from(&union_fields))
468            }
469            DataType::Decimal32(p, s)
470            | DataType::Decimal64(p, s)
471            | DataType::Decimal128(p, s)
472            | DataType::Decimal256(p, s) => Decimal(p, s),
473            DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
474            DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
475            DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
476        }
477    }
478}
479
480impl NativeType {
481    #[inline]
482    pub fn is_numeric(&self) -> bool {
483        self.is_integer() || self.is_float() || self.is_decimal()
484    }
485
486    #[inline]
487    pub fn is_integer(&self) -> bool {
488        use NativeType::*;
489        matches!(
490            self,
491            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
492        )
493    }
494
495    #[inline]
496    pub fn is_timestamp(&self) -> bool {
497        matches!(self, NativeType::Timestamp(_, _))
498    }
499
500    #[inline]
501    pub fn is_date(&self) -> bool {
502        *self == NativeType::Date
503    }
504
505    #[inline]
506    pub fn is_time(&self) -> bool {
507        matches!(self, NativeType::Time(_))
508    }
509
510    #[inline]
511    pub fn is_interval(&self) -> bool {
512        matches!(self, NativeType::Interval(_))
513    }
514
515    #[inline]
516    pub fn is_duration(&self) -> bool {
517        matches!(self, NativeType::Duration(_))
518    }
519
520    #[inline]
521    pub fn is_binary(&self) -> bool {
522        matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_))
523    }
524
525    #[inline]
526    pub fn is_null(&self) -> bool {
527        *self == NativeType::Null
528    }
529
530    #[inline]
531    pub fn is_decimal(&self) -> bool {
532        matches!(self, Self::Decimal(_, _))
533    }
534
535    #[inline]
536    pub fn is_float(&self) -> bool {
537        matches!(self, Self::Float16 | Self::Float32 | Self::Float64)
538    }
539}