datafusion_common/types/native.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use super::{
19 LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
20 TypeSignature,
21};
22use crate::error::{Result, _internal_err};
23use arrow::compute::can_cast_types;
24use arrow::datatypes::{
25 DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
26 DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION,
27};
28use std::{fmt::Display, sync::Arc};
29
/// Representation of a type that DataFusion can handle natively. It is a subset
/// of the physical variants in Arrow's native [`DataType`].
///
/// Arrow physical types that differ only in their encoding collapse into a
/// single variant here; for example `Utf8`, `LargeUtf8` and `Utf8View` are all
/// represented by [`NativeType::String`].
///
/// The derived `PartialOrd` / `Ord` implementations compare variants in
/// declaration order, so reordering the variants changes the ordering.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum NativeType {
    /// Null type
    Null,
    /// A boolean type representing the values `true` and `false`.
    Boolean,
    /// A signed 8-bit integer.
    Int8,
    /// A signed 16-bit integer.
    Int16,
    /// A signed 32-bit integer.
    Int32,
    /// A signed 64-bit integer.
    Int64,
    /// An unsigned 8-bit integer.
    UInt8,
    /// An unsigned 16-bit integer.
    UInt16,
    /// An unsigned 32-bit integer.
    UInt32,
    /// An unsigned 64-bit integer.
    UInt64,
    /// A 16-bit floating point number.
    Float16,
    /// A 32-bit floating point number.
    Float32,
    /// A 64-bit floating point number.
    Float64,
    /// A timestamp with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a signed 64-bit integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// Timestamps with a non-empty timezone
    /// ------------------------------------
    ///
    /// If a Timestamp column has a non-empty timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
    ///
    /// Therefore, timestamp values with a non-empty timezone correspond to
    /// physical points in time together with some additional information about
    /// how the data was obtained and/or how to display it (the timezone).
    ///
    /// For example, the timestamp value 0 with the timezone string "Europe/Paris"
    /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
    /// application may prefer to display it as "January 1st 1970, 01h00" in
    /// the Europe/Paris timezone (which is the same physical point in time).
    ///
    /// One consequence is that timestamp values with a non-empty timezone
    /// can be compared and ordered directly, since they all share the same
    /// well-known point of reference (the Unix epoch).
    ///
    /// Timestamps with an unset / empty timezone
    /// -----------------------------------------
    ///
    /// If a Timestamp column has no timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
    ///
    /// Therefore, timestamp values without a timezone cannot be meaningfully
    /// interpreted as physical points in time, but only as calendar / clock
    /// indications ("wall clock time") in an unspecified timezone.
    ///
    /// For example, the timestamp value 0 with an empty timezone string
    /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
    /// is not enough information to interpret it as a well-defined physical
    /// point in time.
    ///
    /// One consequence is that timestamp values without a timezone cannot
    /// be reliably compared or ordered, since they may have different points of
    /// reference. In particular, it is *not* possible to interpret an unset
    /// or empty timezone as the same as "UTC".
    ///
    /// Conversion between timezones
    /// ----------------------------
    ///
    /// If a Timestamp column has a non-empty timezone, changing the timezone
    /// to a different non-empty value is a metadata-only operation:
    /// the timestamp values need not change as their point of reference remains
    /// the same (the Unix epoch).
    ///
    /// However, if a Timestamp column has no timezone value, changing it to a
    /// non-empty value requires thinking about the desired semantics.
    /// One possibility is to assume that the original timestamp values are
    /// relative to the epoch of the timezone being set; timestamp values should
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
    /// empty to "Europe/Paris" would require converting the timestamp values
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
    /// nevertheless correct).
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, TimeUnit};
    /// DataType::Timestamp(TimeUnit::Second, None);
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
    /// ```
    Timestamp(TimeUnit, Option<Arc<str>>),
    /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    ///
    /// Both Arrow `Date32` and `Date64` are represented by this variant.
    Date,
    /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
    ///
    /// Both Arrow `Time32` and `Time64` are represented by this variant.
    Time(TimeUnit),
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
    Duration(TimeUnit),
    /// A "calendar" interval which models types that don't necessarily
    /// have a precise duration without the context of a base timestamp (e.g.
    /// days can differ in length during daylight saving time transitions).
    Interval(IntervalUnit),
    /// Opaque binary data of variable length.
    ///
    /// Arrow `Binary`, `LargeBinary` and `BinaryView` are all represented by
    /// this variant.
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(i32),
    /// A variable-length string in Unicode with UTF-8 encoding.
    ///
    /// Arrow `Utf8`, `LargeUtf8` and `Utf8View` are all represented by this
    /// variant.
    String,
    /// A list of some logical data type with variable length.
    ///
    /// Arrow `List`, `ListView`, `LargeList` and `LargeListView` are all
    /// represented by this variant.
    List(LogicalFieldRef),
    /// A list of some logical data type with fixed length.
    FixedSizeList(LogicalFieldRef, i32),
    /// A nested type that contains a number of sub-fields.
    Struct(LogicalFields),
    /// A nested type that can represent slots of differing types.
    Union(LogicalUnionFields),
    /// Decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal point
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, the scale can be a negative number. For
    /// a negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    ///
    /// Arrow `Decimal32`, `Decimal64`, `Decimal128` and `Decimal256` are all
    /// represented by this variant.
    Decimal(u8, i8),
    /// A Map is a type that represents an association between a key and a value.
    ///
    /// The key and value types are not constrained, but keys should be
    /// hashable and unique.
    ///
    /// In a field with Map type, the field has a child Struct field, which then
    /// has two children: the first is the key type and the second the value
    /// type. The names of the child fields may be respectively "entries",
    /// "key", and "value", but this is not enforced.
    Map(LogicalFieldRef),
}
186
187impl Display for NativeType {
188 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189 write!(f, "{self:?}") // TODO: nicer formatting
190 }
191}
192
193impl LogicalType for NativeType {
194 fn native(&self) -> &NativeType {
195 self
196 }
197
198 fn signature(&self) -> TypeSignature<'_> {
199 TypeSignature::Native(self)
200 }
201
202 /// Returns the default casted type for the given arrow type
203 ///
204 /// For types like String or Date, multiple arrow types mapped to the same logical type
205 /// If the given arrow type is one of them, we return the same type
206 /// Otherwise, we define the default casted type for the given arrow type
207 fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
208 use DataType::*;
209
210 fn default_field_cast(to: &LogicalField, from: &Field) -> Result<FieldRef> {
211 Ok(Arc::new(Field::new(
212 to.name.clone(),
213 to.logical_type.default_cast_for(from.data_type())?,
214 to.nullable,
215 )))
216 }
217
218 Ok(match (self, origin) {
219 (Self::Null, _) => Null,
220 (Self::Boolean, _) => Boolean,
221 (Self::Int8, _) => Int8,
222 (Self::Int16, _) => Int16,
223 (Self::Int32, _) => Int32,
224 (Self::Int64, _) => Int64,
225 (Self::UInt8, _) => UInt8,
226 (Self::UInt16, _) => UInt16,
227 (Self::UInt32, _) => UInt32,
228 (Self::UInt64, _) => UInt64,
229 (Self::Float16, _) => Float16,
230 (Self::Float32, _) => Float32,
231 (Self::Float64, _) => Float64,
232 (Self::Decimal(p, s), _) if *p <= DECIMAL32_MAX_PRECISION => {
233 Decimal32(*p, *s)
234 }
235 (Self::Decimal(p, s), _) if *p <= DECIMAL64_MAX_PRECISION => {
236 Decimal64(*p, *s)
237 }
238 (Self::Decimal(p, s), _) if *p <= DECIMAL128_MAX_PRECISION => {
239 Decimal128(*p, *s)
240 }
241 (Self::Decimal(p, s), _) => Decimal256(*p, *s),
242 (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
243 // If given type is Date, return the same type
244 (Self::Date, origin) if matches!(origin, Date32 | Date64) => {
245 origin.to_owned()
246 }
247 (Self::Date, _) => Date32,
248 (Self::Time(tu), _) => match tu {
249 TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
250 TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
251 },
252 (Self::Duration(tu), _) => Duration(*tu),
253 (Self::Interval(iu), _) => Interval(*iu),
254 (Self::Binary, LargeUtf8) => LargeBinary,
255 (Self::Binary, Utf8View) => BinaryView,
256 (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
257 BinaryView
258 }
259 (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => {
260 LargeBinary
261 }
262 (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary,
263 (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
264 (Self::String, LargeBinary) => LargeUtf8,
265 (Self::String, BinaryView) => Utf8View,
266 // We don't cast to another kind of string type if the origin one is already a string type
267 (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(),
268 (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View,
269 (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => {
270 LargeUtf8
271 }
272 (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8,
273 (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => {
274 List(default_field_cast(to_field, from_field)?)
275 }
276 (Self::List(to_field), LargeList(from_field)) => {
277 LargeList(default_field_cast(to_field, from_field)?)
278 }
279 (Self::List(to_field), ListView(from_field)) => {
280 ListView(default_field_cast(to_field, from_field)?)
281 }
282 (Self::List(to_field), LargeListView(from_field)) => {
283 LargeListView(default_field_cast(to_field, from_field)?)
284 }
285 // List array where each element is a len 1 list of the origin type
286 (Self::List(field), _) => List(Arc::new(Field::new(
287 field.name.clone(),
288 field.logical_type.default_cast_for(origin)?,
289 field.nullable,
290 ))),
291 (
292 Self::FixedSizeList(to_field, to_size),
293 FixedSizeList(from_field, from_size),
294 ) if from_size == to_size => {
295 FixedSizeList(default_field_cast(to_field, from_field)?, *to_size)
296 }
297 (
298 Self::FixedSizeList(to_field, size),
299 List(from_field)
300 | LargeList(from_field)
301 | ListView(from_field)
302 | LargeListView(from_field),
303 ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size),
304 // FixedSizeList array where each element is a len 1 list of the origin type
305 (Self::FixedSizeList(field, size), _) => FixedSizeList(
306 Arc::new(Field::new(
307 field.name.clone(),
308 field.logical_type.default_cast_for(origin)?,
309 field.nullable,
310 )),
311 *size,
312 ),
313 // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
314 (Self::Struct(to_fields), Struct(from_fields))
315 if from_fields.len() == to_fields.len() =>
316 {
317 Struct(
318 from_fields
319 .iter()
320 .zip(to_fields.iter())
321 .map(|(from, to)| default_field_cast(to, from))
322 .collect::<Result<Fields>>()?,
323 )
324 }
325 (Self::Struct(to_fields), Null) => Struct(
326 to_fields
327 .iter()
328 .map(|field| {
329 Ok(Arc::new(Field::new(
330 field.name.clone(),
331 field.logical_type.default_cast_for(&Null)?,
332 field.nullable,
333 )))
334 })
335 .collect::<Result<Fields>>()?,
336 ),
337 (Self::Map(to_field), Map(from_field, sorted)) => {
338 Map(default_field_cast(to_field, from_field)?, *sorted)
339 }
340 (Self::Map(field), Null) => Map(
341 Arc::new(Field::new(
342 field.name.clone(),
343 field.logical_type.default_cast_for(&Null)?,
344 field.nullable,
345 )),
346 false,
347 ),
348 (Self::Union(to_fields), Union(from_fields, mode))
349 if from_fields.len() == to_fields.len() =>
350 {
351 Union(
352 from_fields
353 .iter()
354 .zip(to_fields.iter())
355 .map(|((_, from), (i, to))| {
356 Ok((*i, default_field_cast(to, from)?))
357 })
358 .collect::<Result<UnionFields>>()?,
359 *mode,
360 )
361 }
362 _ => {
363 return _internal_err!(
364 "Unavailable default cast for native type {} from physical type {}",
365 self,
366 origin
367 )
368 }
369 })
370 }
371}
372
373// The following From<DataType>, From<Field>, ... implementations are temporary
374// mapping solutions to provide backwards compatibility while transitioning from
375// the purely physical system to a logical / physical system.
376
377impl From<&DataType> for NativeType {
378 fn from(value: &DataType) -> Self {
379 value.clone().into()
380 }
381}
382
383impl From<DataType> for NativeType {
384 fn from(value: DataType) -> Self {
385 use NativeType::*;
386 match value {
387 DataType::Null => Null,
388 DataType::Boolean => Boolean,
389 DataType::Int8 => Int8,
390 DataType::Int16 => Int16,
391 DataType::Int32 => Int32,
392 DataType::Int64 => Int64,
393 DataType::UInt8 => UInt8,
394 DataType::UInt16 => UInt16,
395 DataType::UInt32 => UInt32,
396 DataType::UInt64 => UInt64,
397 DataType::Float16 => Float16,
398 DataType::Float32 => Float32,
399 DataType::Float64 => Float64,
400 DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
401 DataType::Date32 | DataType::Date64 => Date,
402 DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
403 DataType::Duration(tu) => Duration(tu),
404 DataType::Interval(iu) => Interval(iu),
405 DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
406 DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
407 DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String,
408 DataType::List(field)
409 | DataType::ListView(field)
410 | DataType::LargeList(field)
411 | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
412 DataType::FixedSizeList(field, size) => {
413 FixedSizeList(Arc::new(field.as_ref().into()), size)
414 }
415 DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
416 DataType::Union(union_fields, _) => {
417 Union(LogicalUnionFields::from(&union_fields))
418 }
419 DataType::Decimal32(p, s)
420 | DataType::Decimal64(p, s)
421 | DataType::Decimal128(p, s)
422 | DataType::Decimal256(p, s) => Decimal(p, s),
423 DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
424 DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
425 DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
426 }
427 }
428}
429
430impl NativeType {
431 #[inline]
432 pub fn is_numeric(&self) -> bool {
433 self.is_integer() || self.is_float() || self.is_decimal()
434 }
435
436 #[inline]
437 pub fn is_integer(&self) -> bool {
438 use NativeType::*;
439 matches!(
440 self,
441 UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
442 )
443 }
444
445 #[inline]
446 pub fn is_timestamp(&self) -> bool {
447 matches!(self, NativeType::Timestamp(_, _))
448 }
449
450 #[inline]
451 pub fn is_date(&self) -> bool {
452 matches!(self, NativeType::Date)
453 }
454
455 #[inline]
456 pub fn is_time(&self) -> bool {
457 matches!(self, NativeType::Time(_))
458 }
459
460 #[inline]
461 pub fn is_interval(&self) -> bool {
462 matches!(self, NativeType::Interval(_))
463 }
464
465 #[inline]
466 pub fn is_duration(&self) -> bool {
467 matches!(self, NativeType::Duration(_))
468 }
469
470 #[inline]
471 pub fn is_binary(&self) -> bool {
472 matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_))
473 }
474
475 #[inline]
476 pub fn is_null(&self) -> bool {
477 matches!(self, NativeType::Null)
478 }
479
480 #[inline]
481 pub fn is_decimal(&self) -> bool {
482 matches!(self, Self::Decimal(_, _))
483 }
484
485 #[inline]
486 pub fn is_float(&self) -> bool {
487 matches!(self, Self::Float16 | Self::Float32 | Self::Float64)
488 }
489}