datafusion_common/types/
native.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use super::{
19    LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
20    TypeSignature,
21};
22use crate::error::{Result, _internal_err};
23use arrow::compute::can_cast_types;
24use arrow::datatypes::{
25    DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
26    DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION,
27};
28use std::{fmt::Display, sync::Arc};
29
/// Representation of a type that DataFusion can handle natively. It is a subset
/// of the physical variants in Arrow's native [`DataType`].
///
/// `PartialOrd` and `Ord` are derived, so the relative order of two values of
/// different variants follows the declaration order of the variants below.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum NativeType {
    /// Null type
    Null,
    /// A boolean type representing the values `true` and `false`.
    Boolean,
    /// A signed 8-bit integer.
    Int8,
    /// A signed 16-bit integer.
    Int16,
    /// A signed 32-bit integer.
    Int32,
    /// A signed 64-bit integer.
    Int64,
    /// An unsigned 8-bit integer.
    UInt8,
    /// An unsigned 16-bit integer.
    UInt16,
    /// An unsigned 32-bit integer.
    UInt32,
    /// An unsigned 64-bit integer.
    UInt64,
    /// A 16-bit floating point number.
    Float16,
    /// A 32-bit floating point number.
    Float32,
    /// A 64-bit floating point number.
    Float64,
    /// A timestamp with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a signed 64-bit integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// Timestamps with a non-empty timezone
    /// ------------------------------------
    ///
    /// If a Timestamp column has a non-empty timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
    ///
    /// Therefore, timestamp values with a non-empty timezone correspond to
    /// physical points in time together with some additional information about
    /// how the data was obtained and/or how to display it (the timezone).
    ///
    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
    ///   application may prefer to display it as "January 1st 1970, 01h00" in
    ///   the Europe/Paris timezone (which is the same physical point in time).
    ///
    /// One consequence is that timestamp values with a non-empty timezone
    /// can be compared and ordered directly, since they all share the same
    /// well-known point of reference (the Unix epoch).
    ///
    /// Timestamps with an unset / empty timezone
    /// -----------------------------------------
    ///
    /// If a Timestamp column has no timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
    ///
    /// Therefore, timestamp values without a timezone cannot be meaningfully
    /// interpreted as physical points in time, but only as calendar / clock
    /// indications ("wall clock time") in an unspecified timezone.
    ///
    ///   For example, the timestamp value 0 with an empty timezone string
    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
    ///   is not enough information to interpret it as a well-defined physical
    ///   point in time.
    ///
    /// One consequence is that timestamp values without a timezone cannot
    /// be reliably compared or ordered, since they may have different points of
    /// reference.  In particular, it is *not* possible to interpret an unset
    /// or empty timezone as the same as "UTC".
    ///
    /// Conversion between timezones
    /// ----------------------------
    ///
    /// If a Timestamp column has a non-empty timezone, changing the timezone
    /// to a different non-empty value is a metadata-only operation:
    /// the timestamp values need not change as their point of reference remains
    /// the same (the Unix epoch).
    ///
    /// However, if a Timestamp column has no timezone value, changing it to a
    /// non-empty value requires thinking about the desired semantics.
    /// One possibility is to assume that the original timestamp values are
    /// relative to the epoch of the timezone being set; timestamp values should
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
    /// empty to "Europe/Paris" would require converting the timestamp values
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
    /// nevertheless correct).
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, TimeUnit};
    /// DataType::Timestamp(TimeUnit::Second, None);
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
    /// ```
    Timestamp(TimeUnit, Option<Arc<str>>),
    /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date,
    /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
    Time(TimeUnit),
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
    Duration(TimeUnit),
    /// A "calendar" interval which models types that don't necessarily
    /// have a precise duration without the context of a base timestamp (e.g.
    /// days can differ in length during daylight saving time transitions).
    Interval(IntervalUnit),
    /// Opaque binary data of variable length.
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(i32),
    /// A variable-length string in Unicode with UTF-8 encoding.
    String,
    /// A list of some logical data type with variable length.
    List(LogicalFieldRef),
    /// A list of some logical data type with fixed length.
    /// The `i32` parameter is the size of each list.
    FixedSizeList(LogicalFieldRef, i32),
    /// A nested type that contains a number of sub-fields.
    Struct(LogicalFields),
    /// A nested type that can represent slots of differing types.
    Union(LogicalUnionFields),
    /// Decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, the scale can be a negative number. A
    /// negative scale is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal(u8, i8),
    /// A Map is a type that represents an association between a key and a value.
    ///
    /// The key and value types are not constrained, but keys should be
    /// hashable and unique.
    ///
    /// The single child field carries the map's contents. NOTE(review): this
    /// presumably mirrors Arrow's `DataType::Map`, where that child is a struct
    /// field whose first child is the key type and whose second is the value
    /// type; confirm against the Arrow documentation. The names of the
    /// child fields may be respectively "entries", "key", and "value", but this is
    /// not enforced.
    Map(LogicalFieldRef),
}
186
187impl Display for NativeType {
188    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189        write!(f, "{self:?}") // TODO: nicer formatting
190    }
191}
192
193impl LogicalType for NativeType {
194    fn native(&self) -> &NativeType {
195        self
196    }
197
198    fn signature(&self) -> TypeSignature<'_> {
199        TypeSignature::Native(self)
200    }
201
202    /// Returns the default casted type for the given arrow type
203    ///
204    /// For types like String or Date, multiple arrow types mapped to the same logical type
205    /// If the given arrow type is one of them, we return the same type
206    /// Otherwise, we define the default casted type for the given arrow type
207    fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
208        use DataType::*;
209
210        fn default_field_cast(to: &LogicalField, from: &Field) -> Result<FieldRef> {
211            Ok(Arc::new(Field::new(
212                to.name.clone(),
213                to.logical_type.default_cast_for(from.data_type())?,
214                to.nullable,
215            )))
216        }
217
218        Ok(match (self, origin) {
219            (Self::Null, _) => Null,
220            (Self::Boolean, _) => Boolean,
221            (Self::Int8, _) => Int8,
222            (Self::Int16, _) => Int16,
223            (Self::Int32, _) => Int32,
224            (Self::Int64, _) => Int64,
225            (Self::UInt8, _) => UInt8,
226            (Self::UInt16, _) => UInt16,
227            (Self::UInt32, _) => UInt32,
228            (Self::UInt64, _) => UInt64,
229            (Self::Float16, _) => Float16,
230            (Self::Float32, _) => Float32,
231            (Self::Float64, _) => Float64,
232            (Self::Decimal(p, s), _) if *p <= DECIMAL32_MAX_PRECISION => {
233                Decimal32(*p, *s)
234            }
235            (Self::Decimal(p, s), _) if *p <= DECIMAL64_MAX_PRECISION => {
236                Decimal64(*p, *s)
237            }
238            (Self::Decimal(p, s), _) if *p <= DECIMAL128_MAX_PRECISION => {
239                Decimal128(*p, *s)
240            }
241            (Self::Decimal(p, s), _) => Decimal256(*p, *s),
242            (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
243            // If given type is Date, return the same type
244            (Self::Date, origin) if matches!(origin, Date32 | Date64) => {
245                origin.to_owned()
246            }
247            (Self::Date, _) => Date32,
248            (Self::Time(tu), _) => match tu {
249                TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
250                TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
251            },
252            (Self::Duration(tu), _) => Duration(*tu),
253            (Self::Interval(iu), _) => Interval(*iu),
254            (Self::Binary, LargeUtf8) => LargeBinary,
255            (Self::Binary, Utf8View) => BinaryView,
256            (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
257                BinaryView
258            }
259            (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => {
260                LargeBinary
261            }
262            (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary,
263            (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
264            (Self::String, LargeBinary) => LargeUtf8,
265            (Self::String, BinaryView) => Utf8View,
266            // We don't cast to another kind of string type if the origin one is already a string type
267            (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(),
268            (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View,
269            (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => {
270                LargeUtf8
271            }
272            (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8,
273            (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => {
274                List(default_field_cast(to_field, from_field)?)
275            }
276            (Self::List(to_field), LargeList(from_field)) => {
277                LargeList(default_field_cast(to_field, from_field)?)
278            }
279            (Self::List(to_field), ListView(from_field)) => {
280                ListView(default_field_cast(to_field, from_field)?)
281            }
282            (Self::List(to_field), LargeListView(from_field)) => {
283                LargeListView(default_field_cast(to_field, from_field)?)
284            }
285            // List array where each element is a len 1 list of the origin type
286            (Self::List(field), _) => List(Arc::new(Field::new(
287                field.name.clone(),
288                field.logical_type.default_cast_for(origin)?,
289                field.nullable,
290            ))),
291            (
292                Self::FixedSizeList(to_field, to_size),
293                FixedSizeList(from_field, from_size),
294            ) if from_size == to_size => {
295                FixedSizeList(default_field_cast(to_field, from_field)?, *to_size)
296            }
297            (
298                Self::FixedSizeList(to_field, size),
299                List(from_field)
300                | LargeList(from_field)
301                | ListView(from_field)
302                | LargeListView(from_field),
303            ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size),
304            // FixedSizeList array where each element is a len 1 list of the origin type
305            (Self::FixedSizeList(field, size), _) => FixedSizeList(
306                Arc::new(Field::new(
307                    field.name.clone(),
308                    field.logical_type.default_cast_for(origin)?,
309                    field.nullable,
310                )),
311                *size,
312            ),
313            // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
314            (Self::Struct(to_fields), Struct(from_fields))
315                if from_fields.len() == to_fields.len() =>
316            {
317                Struct(
318                    from_fields
319                        .iter()
320                        .zip(to_fields.iter())
321                        .map(|(from, to)| default_field_cast(to, from))
322                        .collect::<Result<Fields>>()?,
323                )
324            }
325            (Self::Struct(to_fields), Null) => Struct(
326                to_fields
327                    .iter()
328                    .map(|field| {
329                        Ok(Arc::new(Field::new(
330                            field.name.clone(),
331                            field.logical_type.default_cast_for(&Null)?,
332                            field.nullable,
333                        )))
334                    })
335                    .collect::<Result<Fields>>()?,
336            ),
337            (Self::Map(to_field), Map(from_field, sorted)) => {
338                Map(default_field_cast(to_field, from_field)?, *sorted)
339            }
340            (Self::Map(field), Null) => Map(
341                Arc::new(Field::new(
342                    field.name.clone(),
343                    field.logical_type.default_cast_for(&Null)?,
344                    field.nullable,
345                )),
346                false,
347            ),
348            (Self::Union(to_fields), Union(from_fields, mode))
349                if from_fields.len() == to_fields.len() =>
350            {
351                Union(
352                    from_fields
353                        .iter()
354                        .zip(to_fields.iter())
355                        .map(|((_, from), (i, to))| {
356                            Ok((*i, default_field_cast(to, from)?))
357                        })
358                        .collect::<Result<UnionFields>>()?,
359                    *mode,
360                )
361            }
362            _ => {
363                return _internal_err!(
364                    "Unavailable default cast for native type {} from physical type {}",
365                    self,
366                    origin
367                )
368            }
369        })
370    }
371}
372
373// The following From<DataType>, From<Field>, ... implementations are temporary
374// mapping solutions to provide backwards compatibility while transitioning from
375// the purely physical system to a logical / physical system.
376
377impl From<&DataType> for NativeType {
378    fn from(value: &DataType) -> Self {
379        value.clone().into()
380    }
381}
382
383impl From<DataType> for NativeType {
384    fn from(value: DataType) -> Self {
385        use NativeType::*;
386        match value {
387            DataType::Null => Null,
388            DataType::Boolean => Boolean,
389            DataType::Int8 => Int8,
390            DataType::Int16 => Int16,
391            DataType::Int32 => Int32,
392            DataType::Int64 => Int64,
393            DataType::UInt8 => UInt8,
394            DataType::UInt16 => UInt16,
395            DataType::UInt32 => UInt32,
396            DataType::UInt64 => UInt64,
397            DataType::Float16 => Float16,
398            DataType::Float32 => Float32,
399            DataType::Float64 => Float64,
400            DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
401            DataType::Date32 | DataType::Date64 => Date,
402            DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
403            DataType::Duration(tu) => Duration(tu),
404            DataType::Interval(iu) => Interval(iu),
405            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
406            DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
407            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String,
408            DataType::List(field)
409            | DataType::ListView(field)
410            | DataType::LargeList(field)
411            | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
412            DataType::FixedSizeList(field, size) => {
413                FixedSizeList(Arc::new(field.as_ref().into()), size)
414            }
415            DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
416            DataType::Union(union_fields, _) => {
417                Union(LogicalUnionFields::from(&union_fields))
418            }
419            DataType::Decimal32(p, s)
420            | DataType::Decimal64(p, s)
421            | DataType::Decimal128(p, s)
422            | DataType::Decimal256(p, s) => Decimal(p, s),
423            DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
424            DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
425            DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
426        }
427    }
428}
429
430impl NativeType {
431    #[inline]
432    pub fn is_numeric(&self) -> bool {
433        self.is_integer() || self.is_float() || self.is_decimal()
434    }
435
436    #[inline]
437    pub fn is_integer(&self) -> bool {
438        use NativeType::*;
439        matches!(
440            self,
441            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
442        )
443    }
444
445    #[inline]
446    pub fn is_timestamp(&self) -> bool {
447        matches!(self, NativeType::Timestamp(_, _))
448    }
449
450    #[inline]
451    pub fn is_date(&self) -> bool {
452        matches!(self, NativeType::Date)
453    }
454
455    #[inline]
456    pub fn is_time(&self) -> bool {
457        matches!(self, NativeType::Time(_))
458    }
459
460    #[inline]
461    pub fn is_interval(&self) -> bool {
462        matches!(self, NativeType::Interval(_))
463    }
464
465    #[inline]
466    pub fn is_duration(&self) -> bool {
467        matches!(self, NativeType::Duration(_))
468    }
469
470    #[inline]
471    pub fn is_binary(&self) -> bool {
472        matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_))
473    }
474
475    #[inline]
476    pub fn is_null(&self) -> bool {
477        matches!(self, NativeType::Null)
478    }
479
480    #[inline]
481    pub fn is_decimal(&self) -> bool {
482        matches!(self, Self::Decimal(_, _))
483    }
484
485    #[inline]
486    pub fn is_float(&self) -> bool {
487        matches!(self, Self::Float16 | Self::Float32 | Self::Float64)
488    }
489}