datafusion_common/types/native.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use super::{
19 LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
20 TypeSignature,
21};
22use crate::error::{_internal_err, Result};
23use arrow::compute::can_cast_types;
24use arrow::datatypes::{
25 DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType,
26 Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
27};
28use std::{fmt::Display, sync::Arc};
29
30/// Representation of a type that DataFusion can handle natively. It is a subset
31/// of the physical variants in Arrow's native [`DataType`].
32#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
33pub enum NativeType {
34 /// Null type
35 Null,
36 /// A boolean type representing the values `true` and `false`.
37 Boolean,
38 /// A signed 8-bit integer.
39 Int8,
40 /// A signed 16-bit integer.
41 Int16,
42 /// A signed 32-bit integer.
43 Int32,
44 /// A signed 64-bit integer.
45 Int64,
46 /// An unsigned 8-bit integer.
47 UInt8,
48 /// An unsigned 16-bit integer.
49 UInt16,
50 /// An unsigned 32-bit integer.
51 UInt32,
52 /// An unsigned 64-bit integer.
53 UInt64,
54 /// A 16-bit floating point number.
55 Float16,
56 /// A 32-bit floating point number.
57 Float32,
58 /// A 64-bit floating point number.
59 Float64,
60 /// A timestamp with an optional timezone.
61 ///
62 /// Time is measured as a Unix epoch, counting the seconds from
63 /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
64 /// as a signed 64-bit integer.
65 ///
66 /// The time zone is a string indicating the name of a time zone, one of:
67 ///
68 /// * As used in the Olson time zone database (the "tz database" or
69 /// "tzdata"), such as "America/New_York"
70 /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
71 ///
72 /// Timestamps with a non-empty timezone
73 /// ------------------------------------
74 ///
75 /// If a Timestamp column has a non-empty timezone value, its epoch is
76 /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
77 /// (the Unix epoch), regardless of the Timestamp's own timezone.
78 ///
79 /// Therefore, timestamp values with a non-empty timezone correspond to
80 /// physical points in time together with some additional information about
81 /// how the data was obtained and/or how to display it (the timezone).
82 ///
83 /// For example, the timestamp value 0 with the timezone string "Europe/Paris"
84 /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
85 /// application may prefer to display it as "January 1st 1970, 01h00" in
86 /// the Europe/Paris timezone (which is the same physical point in time).
87 ///
88 /// One consequence is that timestamp values with a non-empty timezone
89 /// can be compared and ordered directly, since they all share the same
90 /// well-known point of reference (the Unix epoch).
91 ///
92 /// Timestamps with an unset / empty timezone
93 /// -----------------------------------------
94 ///
95 /// If a Timestamp column has no timezone value, its epoch is
96 /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
97 ///
98 /// Therefore, timestamp values without a timezone cannot be meaningfully
99 /// interpreted as physical points in time, but only as calendar / clock
100 /// indications ("wall clock time") in an unspecified timezone.
101 ///
102 /// For example, the timestamp value 0 with an empty timezone string
103 /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
104 /// is not enough information to interpret it as a well-defined physical
105 /// point in time.
106 ///
107 /// One consequence is that timestamp values without a timezone cannot
108 /// be reliably compared or ordered, since they may have different points of
109 /// reference. In particular, it is *not* possible to interpret an unset
110 /// or empty timezone as the same as "UTC".
111 ///
112 /// Conversion between timezones
113 /// ----------------------------
114 ///
115 /// If a Timestamp column has a non-empty timezone, changing the timezone
116 /// to a different non-empty value is a metadata-only operation:
117 /// the timestamp values need not change as their point of reference remains
118 /// the same (the Unix epoch).
119 ///
120 /// However, if a Timestamp column has no timezone value, changing it to a
121 /// non-empty value requires to think about the desired semantics.
122 /// One possibility is to assume that the original timestamp values are
123 /// relative to the epoch of the timezone being set; timestamp values should
124 /// then adjusted to the Unix epoch (for example, changing the timezone from
125 /// empty to "Europe/Paris" would require converting the timestamp values
126 /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
127 /// nevertheless correct).
128 ///
129 /// ```
130 /// # use arrow::datatypes::{DataType, TimeUnit};
131 /// DataType::Timestamp(TimeUnit::Second, None);
132 /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
133 /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
134 /// ```
135 Timestamp(TimeUnit, Option<Arc<str>>),
136 /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
137 /// in days.
138 Date,
139 /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
140 Time(TimeUnit),
141 /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
142 Duration(TimeUnit),
143 /// A "calendar" interval which models types that don't necessarily
144 /// have a precise duration without the context of a base timestamp (e.g.
145 /// days can differ in length during day light savings time transitions).
146 Interval(IntervalUnit),
147 /// Opaque binary data of variable length.
148 Binary,
149 /// Opaque binary data of fixed size.
150 /// Enum parameter specifies the number of bytes per value.
151 FixedSizeBinary(i32),
152 /// A variable-length string in Unicode with UTF-8 encoding.
153 String,
154 /// A list of some logical data type with variable length.
155 List(LogicalFieldRef),
156 /// A list of some logical data type with fixed length.
157 FixedSizeList(LogicalFieldRef, i32),
158 /// A nested type that contains a number of sub-fields.
159 Struct(LogicalFields),
160 /// A nested type that can represent slots of differing types.
161 Union(LogicalUnionFields),
162 /// Decimal value with precision and scale
163 ///
164 /// * precision is the total number of digits
165 /// * scale is the number of digits past the decimal
166 ///
167 /// For example the number 123.45 has precision 5 and scale 2.
168 ///
169 /// In certain situations, scale could be negative number. For
170 /// negative scale, it is the number of padding 0 to the right
171 /// of the digits.
172 ///
173 /// For example the number 12300 could be treated as a decimal
174 /// has precision 3 and scale -2.
175 Decimal(u8, i8),
176 /// A Map is a type that an association between a key and a value.
177 ///
178 /// The key and value types are not constrained, but keys should be
179 /// hashable and unique.
180 ///
181 /// In a field with Map type, key type and the second the value type. The names of the
182 /// child fields may be respectively "entries", "key", and "value", but this is
183 /// not enforced.
184 Map(LogicalFieldRef),
185}
186
187impl Display for NativeType {
188 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189 // Match the format used by arrow::datatypes::DataType's Display impl
190 match self {
191 Self::Null => write!(f, "Null"),
192 Self::Boolean => write!(f, "Boolean"),
193 Self::Int8 => write!(f, "Int8"),
194 Self::Int16 => write!(f, "Int16"),
195 Self::Int32 => write!(f, "Int32"),
196 Self::Int64 => write!(f, "Int64"),
197 Self::UInt8 => write!(f, "UInt8"),
198 Self::UInt16 => write!(f, "UInt16"),
199 Self::UInt32 => write!(f, "UInt32"),
200 Self::UInt64 => write!(f, "UInt64"),
201 Self::Float16 => write!(f, "Float16"),
202 Self::Float32 => write!(f, "Float32"),
203 Self::Float64 => write!(f, "Float64"),
204 Self::Timestamp(unit, Some(tz)) => write!(f, "Timestamp({unit}, {tz:?})"),
205 Self::Timestamp(unit, None) => write!(f, "Timestamp({unit})"),
206 Self::Date => write!(f, "Date"),
207 Self::Time(unit) => write!(f, "Time({unit})"),
208 Self::Duration(unit) => write!(f, "Duration({unit})"),
209 Self::Interval(unit) => write!(f, "Interval({unit:?})"),
210 Self::Binary => write!(f, "Binary"),
211 Self::FixedSizeBinary(size) => write!(f, "FixedSizeBinary({size})"),
212 Self::String => write!(f, "String"),
213 Self::List(field) => write!(f, "List({})", field.logical_type),
214 Self::FixedSizeList(field, size) => {
215 write!(f, "FixedSizeList({size} x {})", field.logical_type)
216 }
217 Self::Struct(fields) => {
218 write!(f, "Struct(")?;
219 for (i, field) in fields.iter().enumerate() {
220 if i > 0 {
221 write!(f, ", ")?;
222 }
223 write!(f, "{:?}: {}", field.name, field.logical_type)?;
224 }
225 write!(f, ")")
226 }
227 Self::Union(fields) => {
228 write!(f, "Union(")?;
229 for (i, (type_id, field)) in fields.iter().enumerate() {
230 if i > 0 {
231 write!(f, ", ")?;
232 }
233 write!(f, "{type_id}: ({:?}: {})", field.name, field.logical_type)?;
234 }
235 write!(f, ")")
236 }
237 Self::Decimal(precision, scale) => write!(f, "Decimal({precision}, {scale})"),
238 Self::Map(field) => write!(f, "Map({})", field.logical_type),
239 }
240 }
241}
242
243impl LogicalType for NativeType {
244 fn native(&self) -> &NativeType {
245 self
246 }
247
248 fn signature(&self) -> TypeSignature<'_> {
249 TypeSignature::Native(self)
250 }
251
252 /// Returns the default casted type for the given arrow type
253 ///
254 /// For types like String or Date, multiple arrow types mapped to the same logical type
255 /// If the given arrow type is one of them, we return the same type
256 /// Otherwise, we define the default casted type for the given arrow type
257 fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
258 use DataType::*;
259
260 fn default_field_cast(to: &LogicalField, from: &Field) -> Result<FieldRef> {
261 Ok(Arc::new(Field::new(
262 to.name.clone(),
263 to.logical_type.default_cast_for(from.data_type())?,
264 to.nullable,
265 )))
266 }
267
268 Ok(match (self, origin) {
269 (Self::Null, _) => Null,
270 (Self::Boolean, _) => Boolean,
271 (Self::Int8, _) => Int8,
272 (Self::Int16, _) => Int16,
273 (Self::Int32, _) => Int32,
274 (Self::Int64, _) => Int64,
275 (Self::UInt8, _) => UInt8,
276 (Self::UInt16, _) => UInt16,
277 (Self::UInt32, _) => UInt32,
278 (Self::UInt64, _) => UInt64,
279 (Self::Float16, _) => Float16,
280 (Self::Float32, _) => Float32,
281 (Self::Float64, _) => Float64,
282 (Self::Decimal(p, s), _) if *p <= DECIMAL32_MAX_PRECISION => {
283 Decimal32(*p, *s)
284 }
285 (Self::Decimal(p, s), _) if *p <= DECIMAL64_MAX_PRECISION => {
286 Decimal64(*p, *s)
287 }
288 (Self::Decimal(p, s), _) if *p <= DECIMAL128_MAX_PRECISION => {
289 Decimal128(*p, *s)
290 }
291 (Self::Decimal(p, s), _) => Decimal256(*p, *s),
292 (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
293 // If given type is Date, return the same type
294 (Self::Date, Date32 | Date64) => origin.to_owned(),
295 (Self::Date, _) => Date32,
296 (Self::Time(tu), _) => match tu {
297 TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
298 TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
299 },
300 (Self::Duration(tu), _) => Duration(*tu),
301 (Self::Interval(iu), _) => Interval(*iu),
302 (Self::Binary, LargeUtf8) => LargeBinary,
303 (Self::Binary, Utf8View) => BinaryView,
304 // We don't cast to another kind of binary type if the origin one is already a binary type
305 (Self::Binary, Binary | LargeBinary | BinaryView) => origin.to_owned(),
306 (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
307 BinaryView
308 }
309 (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => {
310 LargeBinary
311 }
312 (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary,
313 (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
314 (Self::String, LargeBinary) => LargeUtf8,
315 (Self::String, BinaryView) => Utf8View,
316 // We don't cast to another kind of string type if the origin one is already a string type
317 (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(),
318 (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View,
319 (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => {
320 LargeUtf8
321 }
322 (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8,
323 (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => {
324 List(default_field_cast(to_field, from_field)?)
325 }
326 (Self::List(to_field), LargeList(from_field)) => {
327 LargeList(default_field_cast(to_field, from_field)?)
328 }
329 (Self::List(to_field), ListView(from_field)) => {
330 ListView(default_field_cast(to_field, from_field)?)
331 }
332 (Self::List(to_field), LargeListView(from_field)) => {
333 LargeListView(default_field_cast(to_field, from_field)?)
334 }
335 // List array where each element is a len 1 list of the origin type
336 (Self::List(field), _) => List(Arc::new(Field::new(
337 field.name.clone(),
338 field.logical_type.default_cast_for(origin)?,
339 field.nullable,
340 ))),
341 (
342 Self::FixedSizeList(to_field, to_size),
343 FixedSizeList(from_field, from_size),
344 ) if from_size == to_size => {
345 FixedSizeList(default_field_cast(to_field, from_field)?, *to_size)
346 }
347 (
348 Self::FixedSizeList(to_field, size),
349 List(from_field)
350 | LargeList(from_field)
351 | ListView(from_field)
352 | LargeListView(from_field),
353 ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size),
354 // FixedSizeList array where each element is a len 1 list of the origin type
355 (Self::FixedSizeList(field, size), _) => FixedSizeList(
356 Arc::new(Field::new(
357 field.name.clone(),
358 field.logical_type.default_cast_for(origin)?,
359 field.nullable,
360 )),
361 *size,
362 ),
363 // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
364 (Self::Struct(to_fields), Struct(from_fields))
365 if from_fields.len() == to_fields.len() =>
366 {
367 Struct(
368 from_fields
369 .iter()
370 .zip(to_fields.iter())
371 .map(|(from, to)| default_field_cast(to, from))
372 .collect::<Result<Fields>>()?,
373 )
374 }
375 (Self::Struct(to_fields), Null) => Struct(
376 to_fields
377 .iter()
378 .map(|field| {
379 Ok(Arc::new(Field::new(
380 field.name.clone(),
381 field.logical_type.default_cast_for(&Null)?,
382 field.nullable,
383 )))
384 })
385 .collect::<Result<Fields>>()?,
386 ),
387 (Self::Map(to_field), Map(from_field, sorted)) => {
388 Map(default_field_cast(to_field, from_field)?, *sorted)
389 }
390 (Self::Map(field), Null) => Map(
391 Arc::new(Field::new(
392 field.name.clone(),
393 field.logical_type.default_cast_for(&Null)?,
394 field.nullable,
395 )),
396 false,
397 ),
398 (Self::Union(to_fields), Union(from_fields, mode))
399 if from_fields.len() == to_fields.len() =>
400 {
401 Union(
402 from_fields
403 .iter()
404 .zip(to_fields.iter())
405 .map(|((_, from), (i, to))| {
406 Ok((*i, default_field_cast(to, from)?))
407 })
408 .collect::<Result<UnionFields>>()?,
409 *mode,
410 )
411 }
412 _ => {
413 return _internal_err!(
414 "Unavailable default cast for native type {} from physical type {}",
415 self,
416 origin
417 );
418 }
419 })
420 }
421}
422
423// The following From<DataType>, From<Field>, ... implementations are temporary
424// mapping solutions to provide backwards compatibility while transitioning from
425// the purely physical system to a logical / physical system.
426
427impl From<&DataType> for NativeType {
428 fn from(value: &DataType) -> Self {
429 value.clone().into()
430 }
431}
432
433impl From<DataType> for NativeType {
434 fn from(value: DataType) -> Self {
435 use NativeType::*;
436 match value {
437 DataType::Null => Null,
438 DataType::Boolean => Boolean,
439 DataType::Int8 => Int8,
440 DataType::Int16 => Int16,
441 DataType::Int32 => Int32,
442 DataType::Int64 => Int64,
443 DataType::UInt8 => UInt8,
444 DataType::UInt16 => UInt16,
445 DataType::UInt32 => UInt32,
446 DataType::UInt64 => UInt64,
447 DataType::Float16 => Float16,
448 DataType::Float32 => Float32,
449 DataType::Float64 => Float64,
450 DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
451 DataType::Date32 | DataType::Date64 => Date,
452 DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
453 DataType::Duration(tu) => Duration(tu),
454 DataType::Interval(iu) => Interval(iu),
455 DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
456 DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
457 DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String,
458 DataType::List(field)
459 | DataType::ListView(field)
460 | DataType::LargeList(field)
461 | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
462 DataType::FixedSizeList(field, size) => {
463 FixedSizeList(Arc::new(field.as_ref().into()), size)
464 }
465 DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
466 DataType::Union(union_fields, _) => {
467 Union(LogicalUnionFields::from(&union_fields))
468 }
469 DataType::Decimal32(p, s)
470 | DataType::Decimal64(p, s)
471 | DataType::Decimal128(p, s)
472 | DataType::Decimal256(p, s) => Decimal(p, s),
473 DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
474 DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
475 DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
476 }
477 }
478}
479
480impl NativeType {
481 #[inline]
482 pub fn is_numeric(&self) -> bool {
483 self.is_integer() || self.is_float() || self.is_decimal()
484 }
485
486 #[inline]
487 pub fn is_integer(&self) -> bool {
488 use NativeType::*;
489 matches!(
490 self,
491 UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
492 )
493 }
494
495 #[inline]
496 pub fn is_timestamp(&self) -> bool {
497 matches!(self, NativeType::Timestamp(_, _))
498 }
499
500 #[inline]
501 pub fn is_date(&self) -> bool {
502 *self == NativeType::Date
503 }
504
505 #[inline]
506 pub fn is_time(&self) -> bool {
507 matches!(self, NativeType::Time(_))
508 }
509
510 #[inline]
511 pub fn is_interval(&self) -> bool {
512 matches!(self, NativeType::Interval(_))
513 }
514
515 #[inline]
516 pub fn is_duration(&self) -> bool {
517 matches!(self, NativeType::Duration(_))
518 }
519
520 #[inline]
521 pub fn is_binary(&self) -> bool {
522 matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_))
523 }
524
525 #[inline]
526 pub fn is_null(&self) -> bool {
527 *self == NativeType::Null
528 }
529
530 #[inline]
531 pub fn is_decimal(&self) -> bool {
532 matches!(self, Self::Decimal(_, _))
533 }
534
535 #[inline]
536 pub fn is_float(&self) -> bool {
537 matches!(self, Self::Float16 | Self::Float32 | Self::Float64)
538 }
539}