1use std::collections::BTreeMap;
2
3use arrow::datatypes::{
4 DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, MAINTAIN_PL_TYPE,
5 Metadata, PL_KEY,
6};
7#[cfg(feature = "dtype-array")]
8use polars_utils::format_tuple;
9use polars_utils::itertools::Itertools;
10#[cfg(any(feature = "serde-lazy", feature = "serde"))]
11use serde::{Deserialize, Serialize};
12pub use temporal::time_zone::TimeZone;
13
14use super::*;
15#[cfg(feature = "object")]
16use crate::chunked_array::object::registry::get_object_physical_type;
17use crate::utils::materialize_dyn_int;
18
19pub trait MetaDataExt: IntoMetadata {
20 fn pl_enum_metadata(&self) -> Option<&str> {
21 let md = self.into_metadata_ref();
22 let values = md
23 .get(DTYPE_ENUM_VALUES_NEW)
24 .or_else(|| md.get(DTYPE_ENUM_VALUES_LEGACY));
25 Some(values?.as_str())
26 }
27
28 fn pl_categorical_metadata(&self) -> Option<&str> {
29 Some(
34 self.into_metadata_ref()
35 .get(DTYPE_CATEGORICAL_NEW)?
36 .as_str(),
37 )
38 }
39
40 fn maintain_type(&self) -> bool {
41 let metadata = self.into_metadata_ref();
42 metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
43 }
44}
45
// `Metadata` gets every `MetaDataExt` accessor through the trait's default methods.
impl MetaDataExt for Metadata {}
/// Gives borrowed access to a value's [`Metadata`].
pub trait IntoMetadata {
    /// Borrows the metadata map.
    // The method is named `into_*` although it only borrows, hence the lint allowance.
    #[allow(clippy::wrong_self_convention)]
    fn into_metadata_ref(&self) -> &Metadata;
}
51
/// A `Metadata` map is trivially its own metadata.
impl IntoMetadata for Metadata {
    fn into_metadata_ref(&self) -> &Metadata {
        self
    }
}
57
// The flavour of a `DataType::Unknown` placeholder, i.e. a dtype that has not been
// resolved to a concrete type yet. See `UnknownKind::materialize` for how each kind
// resolves.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum UnknownKind {
    // NOTE(review): presumably the not-yet-known output dtype of a ufunc — confirm.
    // It never materializes to a concrete dtype.
    Ufunc,
    // Dynamic integer; the payload is handed to `materialize_dyn_int` to pick the
    // concrete integer dtype.
    Int(i128),
    // Dynamic float; materializes to `Float64`.
    Float,
    // Dynamic string; materializes to `String`.
    Str,
    // Nothing is known about the dtype. This is the default and never materializes.
    #[default]
    Any,
}
74
75impl UnknownKind {
76 pub fn materialize(&self) -> Option<DataType> {
77 let dtype = match self {
78 UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
79 UnknownKind::Float => DataType::Float64,
80 UnknownKind::Str => DataType::String,
81 UnknownKind::Any | UnknownKind::Ufunc => return None,
82 };
83 Some(dtype)
84 }
85}
86
/// The data type of a column or value.
///
/// Variants carrying a `#[cfg(feature = ...)]` attribute only exist when that feature is
/// enabled. Physical representations mentioned below are the ones chosen by
/// [`DataType::to_physical`].
#[derive(Clone)]
pub enum DataType {
    /// Boolean values.
    Boolean,
    /// Unsigned 8-bit integer.
    UInt8,
    /// Unsigned 16-bit integer.
    UInt16,
    /// Unsigned 32-bit integer.
    UInt32,
    /// Unsigned 64-bit integer.
    UInt64,
    /// Signed 8-bit integer.
    Int8,
    /// Signed 16-bit integer.
    Int16,
    /// Signed 32-bit integer.
    Int32,
    /// Signed 64-bit integer.
    Int64,
    /// Signed 128-bit integer.
    Int128,
    /// 32-bit floating point.
    Float32,
    /// 64-bit floating point.
    Float64,
    /// Decimal with optional precision (first field) and optional scale (second field).
    /// Physically stored as `Int128`.
    #[cfg(feature = "dtype-decimal")]
    Decimal(Option<usize>, Option<usize>),
    /// String data.
    String,
    /// Binary data.
    Binary,
    /// Binary data that is always exported to Arrow as `LargeBinary`.
    BinaryOffset,
    /// Date; physically stored as `Int32`.
    Date,
    /// Timestamp in the given time unit with an optional time zone; physically `Int64`.
    Datetime(TimeUnit, Option<TimeZone>),
    /// Duration in the given time unit; physically `Int64`.
    Duration(TimeUnit),
    /// Time of day; physically `Int64` and exported to Arrow as nanosecond `Time64`.
    Time,
    /// Nested type with a fixed width (second field) per row.
    #[cfg(feature = "dtype-array")]
    Array(Box<DataType>, usize),
    /// Nested type with a variable number of values per row.
    List(Box<DataType>),
    /// Opaque object type; the string is what `Display` prints for it.
    #[cfg(feature = "object")]
    Object(&'static str),
    /// The null type.
    Null,
    /// Categorical backed by a set of categories and its mapping. Two categorical dtypes
    /// compare equal only when they share the same `Categories` allocation.
    #[cfg(feature = "dtype-categorical")]
    Categorical(Arc<Categories>, Arc<CategoricalMapping>),
    /// Enum backed by frozen categories and their mapping. Two enum dtypes compare equal
    /// only when they share the same `FrozenCategories` allocation.
    #[cfg(feature = "dtype-categorical")]
    Enum(Arc<FrozenCategories>, Arc<CategoricalMapping>),
    /// Struct made of named, typed fields.
    #[cfg(feature = "dtype-struct")]
    Struct(Vec<Field>),
    /// Placeholder for a dtype that is not (fully) known yet.
    Unknown(UnknownKind),
}
140
141impl Default for DataType {
142 fn default() -> Self {
143 DataType::Unknown(UnknownKind::Any)
144 }
145}
146
/// Implemented by types that can hand out a reference to a [`DataType`].
pub trait AsRefDataType {
    /// Borrows the dtype.
    fn as_ref_dtype(&self) -> &DataType;
}
150
151impl Hash for DataType {
152 fn hash<H: Hasher>(&self, state: &mut H) {
153 std::mem::discriminant(self).hash(state)
154 }
155}
156
157impl PartialEq for DataType {
158 fn eq(&self, other: &Self) -> bool {
159 use DataType::*;
160 {
161 match (self, other) {
162 #[cfg(feature = "dtype-categorical")]
163 (Categorical(cats_l, _), Categorical(cats_r, _)) => Arc::ptr_eq(cats_l, cats_r),
164 #[cfg(feature = "dtype-categorical")]
165 (Enum(fcats_l, _), Enum(fcats_r, _)) => Arc::ptr_eq(fcats_l, fcats_r),
166 (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
167 (List(left_inner), List(right_inner)) => left_inner == right_inner,
168 #[cfg(feature = "dtype-duration")]
169 (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
170 #[cfg(feature = "dtype-decimal")]
171 (Decimal(l_prec, l_scale), Decimal(r_prec, r_scale)) => {
172 let is_prec_eq = l_prec.is_none() || r_prec.is_none() || l_prec == r_prec;
173 let is_scale_eq = l_scale.is_none() || r_scale.is_none() || l_scale == r_scale;
174
175 is_prec_eq && is_scale_eq
176 },
177 #[cfg(feature = "object")]
178 (Object(lhs), Object(rhs)) => lhs == rhs,
179 #[cfg(feature = "dtype-struct")]
180 (Struct(lhs), Struct(rhs)) => {
181 std::ptr::eq(Vec::as_ptr(lhs), Vec::as_ptr(rhs)) || lhs == rhs
182 },
183 #[cfg(feature = "dtype-array")]
184 (Array(left_inner, left_width), Array(right_inner, right_width)) => {
185 left_width == right_width && left_inner == right_inner
186 },
187 (Unknown(l), Unknown(r)) => match (l, r) {
188 (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
189 _ => l == r,
190 },
191 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
192 }
193 }
194 }
195}
196
// Marker impl; the comparison itself lives in `PartialEq for DataType`.
impl Eq for DataType {}
198
199impl DataType {
200 pub const IDX_DTYPE: Self = {
201 #[cfg(not(feature = "bigidx"))]
202 {
203 DataType::UInt32
204 }
205 #[cfg(feature = "bigidx")]
206 {
207 DataType::UInt64
208 }
209 };
210
211 pub fn value_within_range(&self, other: AnyValue) -> bool {
212 use DataType::*;
213 match self {
214 UInt8 => other.extract::<u8>().is_some(),
215 #[cfg(feature = "dtype-u16")]
216 UInt16 => other.extract::<u16>().is_some(),
217 UInt32 => other.extract::<u32>().is_some(),
218 UInt64 => other.extract::<u64>().is_some(),
219 #[cfg(feature = "dtype-i8")]
220 Int8 => other.extract::<i8>().is_some(),
221 #[cfg(feature = "dtype-i16")]
222 Int16 => other.extract::<i16>().is_some(),
223 Int32 => other.extract::<i32>().is_some(),
224 Int64 => other.extract::<i64>().is_some(),
225 _ => false,
226 }
227 }
228
229 #[cfg(feature = "dtype-struct")]
231 pub fn _month_days_ns_struct_type() -> Self {
232 DataType::Struct(vec![
233 Field::new(PlSmallStr::from_static("months"), DataType::Int32),
234 Field::new(PlSmallStr::from_static("days"), DataType::Int32),
235 Field::new(
236 PlSmallStr::from_static("nanoseconds"),
237 DataType::Duration(TimeUnit::Nanoseconds),
238 ),
239 ])
240 }
241
242 pub fn is_known(&self) -> bool {
244 match self {
245 DataType::List(inner) => inner.is_known(),
246 #[cfg(feature = "dtype-array")]
247 DataType::Array(inner, _) => inner.is_known(),
248 #[cfg(feature = "dtype-struct")]
249 DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
250 DataType::Unknown(_) => false,
251 _ => true,
252 }
253 }
254
255 pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
258 match self {
259 DataType::Unknown(u) => match u.materialize() {
260 Some(known) => Ok(known),
261 None => {
262 if allow_unknown {
263 Ok(DataType::Unknown(u))
264 } else {
265 polars_bail!(SchemaMismatch: "failed to materialize unknown type")
266 }
267 },
268 },
269 DataType::List(inner) => Ok(DataType::List(Box::new(
270 inner.materialize_unknown(allow_unknown)?,
271 ))),
272 #[cfg(feature = "dtype-array")]
273 DataType::Array(inner, size) => Ok(DataType::Array(
274 Box::new(inner.materialize_unknown(allow_unknown)?),
275 size,
276 )),
277 #[cfg(feature = "dtype-struct")]
278 DataType::Struct(fields) => Ok(DataType::Struct(
279 fields
280 .into_iter()
281 .map(|f| {
282 PolarsResult::Ok(Field::new(
283 f.name,
284 f.dtype.materialize_unknown(allow_unknown)?,
285 ))
286 })
287 .try_collect_vec()?,
288 )),
289 _ => Ok(self),
290 }
291 }
292
293 #[cfg(feature = "dtype-array")]
294 pub fn get_shape(&self) -> Option<Vec<usize>> {
296 fn get_shape_impl(dt: &DataType, shape: &mut Vec<usize>) {
297 if let DataType::Array(inner, size) = dt {
298 shape.push(*size);
299 get_shape_impl(inner, shape);
300 }
301 }
302
303 if let DataType::Array(inner, size) = self {
304 let mut shape = vec![*size];
305 get_shape_impl(inner, &mut shape);
306 Some(shape)
307 } else {
308 None
309 }
310 }
311
312 pub fn inner_dtype(&self) -> Option<&DataType> {
314 match self {
315 DataType::List(inner) => Some(inner),
316 #[cfg(feature = "dtype-array")]
317 DataType::Array(inner, _) => Some(inner),
318 _ => None,
319 }
320 }
321
322 pub fn into_inner_dtype(self) -> Option<DataType> {
324 match self {
325 DataType::List(inner) => Some(*inner),
326 #[cfg(feature = "dtype-array")]
327 DataType::Array(inner, _) => Some(*inner),
328 _ => None,
329 }
330 }
331
332 pub fn try_into_inner_dtype(self) -> PolarsResult<DataType> {
334 match self {
335 DataType::List(inner) => Ok(*inner),
336 #[cfg(feature = "dtype-array")]
337 DataType::Array(inner, _) => Ok(*inner),
338 dt => polars_bail!(InvalidOperation: "cannot get inner datatype of `{dt}`"),
339 }
340 }
341
342 pub fn leaf_dtype(&self) -> &DataType {
344 let mut prev = self;
345 while let Some(dtype) = prev.inner_dtype() {
346 prev = dtype
347 }
348 prev
349 }
350
351 #[cfg(feature = "dtype-array")]
352 pub fn array_leaf_dtype(&self) -> Option<&DataType> {
354 let mut prev = self;
355 match prev {
356 DataType::Array(_, _) => {
357 while let DataType::Array(inner, _) = &prev {
358 prev = inner;
359 }
360 Some(prev)
361 },
362 _ => None,
363 }
364 }
365
366 pub fn cast_leaf(&self, to: DataType) -> DataType {
368 use DataType::*;
369 match self {
370 List(inner) => List(Box::new(inner.cast_leaf(to))),
371 #[cfg(feature = "dtype-array")]
372 Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
373 _ => to,
374 }
375 }
376
377 pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
381 if self == to {
382 return Some(true);
383 }
384 if self.is_primitive_numeric() && to.is_primitive_numeric() {
385 return Some(true);
386 }
387
388 if self.is_null() {
389 return Some(true);
390 }
391
392 use DataType as D;
393 Some(match (self, to) {
394 #[cfg(feature = "dtype-categorical")]
395 (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
396 | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false, #[cfg(feature = "object")]
399 (D::Object(_), D::Object(_)) => true,
400 #[cfg(feature = "object")]
401 (D::Object(_), _) | (_, D::Object(_)) => false,
402
403 (D::Boolean, dt) | (dt, D::Boolean) => match dt {
404 dt if dt.is_primitive_numeric() => true,
405 #[cfg(feature = "dtype-decimal")]
406 D::Decimal(_, _) => true,
407 D::String | D::Binary => true,
408 _ => false,
409 },
410
411 (D::List(from), D::List(to)) => from.can_cast_to(to)?,
412 #[cfg(feature = "dtype-array")]
413 (D::Array(from, l_width), D::Array(to, r_width)) => {
414 l_width == r_width && from.can_cast_to(to)?
415 },
416 #[cfg(feature = "dtype-struct")]
417 (D::Struct(l_fields), D::Struct(r_fields)) => {
418 if l_fields.is_empty() {
419 return Some(true);
420 }
421
422 if l_fields.len() != r_fields.len() {
423 return Some(false);
424 }
425
426 for (l, r) in l_fields.iter().zip(r_fields) {
427 if !l.dtype().can_cast_to(r.dtype())? {
428 return Some(false);
429 }
430 }
431
432 true
433 },
434
435 _ => return None,
437 })
438 }
439
440 pub fn implode(self) -> DataType {
441 DataType::List(Box::new(self))
442 }
443
444 #[must_use]
446 pub fn to_physical(&self) -> DataType {
447 use DataType::*;
448 match self {
449 Date => Int32,
450 Datetime(_, _) => Int64,
451 Duration(_) => Int64,
452 Time => Int64,
453 #[cfg(feature = "dtype-decimal")]
454 Decimal(_, _) => Int128,
455 #[cfg(feature = "dtype-categorical")]
456 Categorical(cats, _) => cats.physical().dtype(),
457 #[cfg(feature = "dtype-categorical")]
458 Enum(fcats, _) => fcats.physical().dtype(),
459 #[cfg(feature = "dtype-array")]
460 Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
461 List(dt) => List(Box::new(dt.to_physical())),
462 #[cfg(feature = "dtype-struct")]
463 Struct(fields) => {
464 let new_fields = fields
465 .iter()
466 .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
467 .collect();
468 Struct(new_fields)
469 },
470 _ => self.clone(),
471 }
472 }
473
474 pub fn is_supported_list_arithmetic_input(&self) -> bool {
475 self.is_primitive_numeric() || self.is_bool() || self.is_null()
476 }
477
478 pub fn is_logical(&self) -> bool {
480 self != &self.to_physical()
481 }
482
483 pub fn is_temporal(&self) -> bool {
485 use DataType::*;
486 matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
487 }
488
489 pub fn is_primitive(&self) -> bool {
492 self.is_primitive_numeric()
493 | matches!(
494 self,
495 DataType::Boolean | DataType::String | DataType::Binary
496 )
497 }
498
499 pub fn is_primitive_numeric(&self) -> bool {
501 self.is_float() || self.is_integer()
502 }
503
504 pub fn is_bool(&self) -> bool {
506 matches!(self, DataType::Boolean)
507 }
508
509 pub fn is_list(&self) -> bool {
511 matches!(self, DataType::List(_))
512 }
513
514 pub fn is_array(&self) -> bool {
516 #[cfg(feature = "dtype-array")]
517 {
518 matches!(self, DataType::Array(_, _))
519 }
520 #[cfg(not(feature = "dtype-array"))]
521 {
522 false
523 }
524 }
525
526 pub fn is_nested(&self) -> bool {
527 self.is_list() || self.is_struct() || self.is_array()
528 }
529
530 pub fn is_struct(&self) -> bool {
532 #[cfg(feature = "dtype-struct")]
533 {
534 matches!(self, DataType::Struct(_))
535 }
536 #[cfg(not(feature = "dtype-struct"))]
537 {
538 false
539 }
540 }
541
542 pub fn is_binary(&self) -> bool {
543 matches!(self, DataType::Binary)
544 }
545
546 pub fn is_date(&self) -> bool {
547 matches!(self, DataType::Date)
548 }
549 pub fn is_datetime(&self) -> bool {
550 matches!(self, DataType::Datetime(..))
551 }
552
553 pub fn is_duration(&self) -> bool {
554 matches!(self, DataType::Duration(..))
555 }
556
557 pub fn is_object(&self) -> bool {
558 #[cfg(feature = "object")]
559 {
560 matches!(self, DataType::Object(_))
561 }
562 #[cfg(not(feature = "object"))]
563 {
564 false
565 }
566 }
567
568 pub fn is_null(&self) -> bool {
569 matches!(self, DataType::Null)
570 }
571
572 pub fn contains_views(&self) -> bool {
573 use DataType::*;
574 match self {
575 Binary | String => true,
576 List(inner) => inner.contains_views(),
577 #[cfg(feature = "dtype-array")]
578 Array(inner, _) => inner.contains_views(),
579 #[cfg(feature = "dtype-struct")]
580 Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
581 _ => false,
582 }
583 }
584
585 pub fn contains_categoricals(&self) -> bool {
586 use DataType::*;
587 match self {
588 #[cfg(feature = "dtype-categorical")]
589 Categorical(_, _) | Enum(_, _) => true,
590 List(inner) => inner.contains_categoricals(),
591 #[cfg(feature = "dtype-array")]
592 Array(inner, _) => inner.contains_categoricals(),
593 #[cfg(feature = "dtype-struct")]
594 Struct(fields) => fields
595 .iter()
596 .any(|field| field.dtype.contains_categoricals()),
597 _ => false,
598 }
599 }
600
601 pub fn contains_objects(&self) -> bool {
602 use DataType::*;
603 match self {
604 #[cfg(feature = "object")]
605 Object(_) => true,
606 List(inner) => inner.contains_objects(),
607 #[cfg(feature = "dtype-array")]
608 Array(inner, _) => inner.contains_objects(),
609 #[cfg(feature = "dtype-struct")]
610 Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
611 _ => false,
612 }
613 }
614
615 pub fn contains_list_recursive(&self) -> bool {
616 use DataType as D;
617 match self {
618 D::List(_) => true,
619 #[cfg(feature = "dtype-array")]
620 D::Array(inner, _) => inner.contains_list_recursive(),
621 #[cfg(feature = "dtype-struct")]
622 D::Struct(fields) => fields
623 .iter()
624 .any(|field| field.dtype.contains_list_recursive()),
625 _ => false,
626 }
627 }
628
629 pub fn contains_unknown(&self) -> bool {
630 use DataType as D;
631 match self {
632 D::Unknown(_) => true,
633 D::List(inner) => inner.contains_unknown(),
634 #[cfg(feature = "dtype-array")]
635 D::Array(inner, _) => inner.contains_unknown(),
636 #[cfg(feature = "dtype-struct")]
637 D::Struct(fields) => fields.iter().any(|field| field.dtype.contains_unknown()),
638 _ => false,
639 }
640 }
641
642 pub fn is_ord(&self) -> bool {
644 let phys = self.to_physical();
645 phys.is_primitive_numeric()
646 || self.is_decimal()
647 || matches!(
648 phys,
649 DataType::Binary | DataType::String | DataType::Boolean
650 )
651 }
652
653 pub fn is_decimal(&self) -> bool {
655 match self {
656 #[cfg(feature = "dtype-decimal")]
657 DataType::Decimal(_, _) => true,
658 _ => false,
659 }
660 }
661
662 pub fn is_float(&self) -> bool {
665 matches!(
666 self,
667 DataType::Float32 | DataType::Float64 | DataType::Unknown(UnknownKind::Float)
668 )
669 }
670
671 pub fn is_integer(&self) -> bool {
673 matches!(
674 self,
675 DataType::Int8
676 | DataType::Int16
677 | DataType::Int32
678 | DataType::Int64
679 | DataType::Int128
680 | DataType::UInt8
681 | DataType::UInt16
682 | DataType::UInt32
683 | DataType::UInt64
684 | DataType::Unknown(UnknownKind::Int(_))
685 )
686 }
687
688 pub fn is_signed_integer(&self) -> bool {
689 matches!(
691 self,
692 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
693 )
694 }
695
696 pub fn is_unsigned_integer(&self) -> bool {
697 matches!(
698 self,
699 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64,
700 )
701 }
702
703 pub fn is_string(&self) -> bool {
704 matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
705 }
706
707 pub fn is_categorical(&self) -> bool {
708 #[cfg(feature = "dtype-categorical")]
709 {
710 matches!(self, DataType::Categorical(_, _))
711 }
712 #[cfg(not(feature = "dtype-categorical"))]
713 {
714 false
715 }
716 }
717
718 pub fn is_enum(&self) -> bool {
719 #[cfg(feature = "dtype-categorical")]
720 {
721 matches!(self, DataType::Enum(_, _))
722 }
723 #[cfg(not(feature = "dtype-categorical"))]
724 {
725 false
726 }
727 }
728
729 pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
731 let metadata = match self {
732 #[cfg(feature = "dtype-categorical")]
733 DataType::Enum(fcats, _map) => {
734 let cats = fcats.categories();
735 let strings_size: usize = cats
736 .values_iter()
737 .map(|s| (s.len() + 1).ilog10() as usize + 1 + s.len())
738 .sum();
739 let mut encoded = String::with_capacity(strings_size);
740 for cat in cats.values_iter() {
741 encoded.push_str(itoa::Buffer::new().format(cat.len()));
742 encoded.push(';');
743 encoded.push_str(cat);
744 }
745 Some(BTreeMap::from([(
746 PlSmallStr::from_static(DTYPE_ENUM_VALUES_NEW),
747 PlSmallStr::from_string(encoded),
748 )]))
749 },
750 #[cfg(feature = "dtype-categorical")]
751 DataType::Categorical(cats, _) => {
752 let mut encoded = String::new();
753 encoded.push_str(itoa::Buffer::new().format(cats.name().len()));
754 encoded.push(';');
755 encoded.push_str(cats.name());
756 encoded.push_str(itoa::Buffer::new().format(cats.namespace().len()));
757 encoded.push(';');
758 encoded.push_str(cats.namespace());
759 encoded.push_str(cats.physical().as_str());
760 encoded.push(';');
761
762 Some(BTreeMap::from([(
763 PlSmallStr::from_static(DTYPE_CATEGORICAL_NEW),
764 PlSmallStr::from_string(encoded),
765 )]))
766 },
767 DataType::BinaryOffset => Some(BTreeMap::from([(
768 PlSmallStr::from_static(PL_KEY),
769 PlSmallStr::from_static(MAINTAIN_PL_TYPE),
770 )])),
771 _ => None,
772 };
773
774 let field = ArrowField::new(name, self.to_arrow(compat_level), true);
775
776 if let Some(metadata) = metadata {
777 field.with_metadata(metadata)
778 } else {
779 field
780 }
781 }
782
783 pub fn max(&self) -> PolarsResult<Scalar> {
785 use DataType::*;
786 let v = match self {
787 Int8 => Scalar::from(i8::MAX),
788 Int16 => Scalar::from(i16::MAX),
789 Int32 => Scalar::from(i32::MAX),
790 Int64 => Scalar::from(i64::MAX),
791 Int128 => Scalar::from(i128::MAX),
792 UInt8 => Scalar::from(u8::MAX),
793 UInt16 => Scalar::from(u16::MAX),
794 UInt32 => Scalar::from(u32::MAX),
795 UInt64 => Scalar::from(u64::MAX),
796 Float32 => Scalar::from(f32::INFINITY),
797 Float64 => Scalar::from(f64::INFINITY),
798 #[cfg(feature = "dtype-time")]
799 Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
800 dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
801 };
802 Ok(v)
803 }
804
805 pub fn min(&self) -> PolarsResult<Scalar> {
807 use DataType::*;
808 let v = match self {
809 Int8 => Scalar::from(i8::MIN),
810 Int16 => Scalar::from(i16::MIN),
811 Int32 => Scalar::from(i32::MIN),
812 Int64 => Scalar::from(i64::MIN),
813 Int128 => Scalar::from(i128::MIN),
814 UInt8 => Scalar::from(u8::MIN),
815 UInt16 => Scalar::from(u16::MIN),
816 UInt32 => Scalar::from(u32::MIN),
817 UInt64 => Scalar::from(u64::MIN),
818 Float32 => Scalar::from(f32::NEG_INFINITY),
819 Float64 => Scalar::from(f64::NEG_INFINITY),
820 #[cfg(feature = "dtype-time")]
821 Time => Scalar::new(Time, AnyValue::Time(0)),
822 dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
823 };
824 Ok(v)
825 }
826
    /// Converts this dtype to its Arrow dtype.
    ///
    /// # Panics
    /// Panics when [`DataType::try_to_arrow`] returns an error.
    #[inline]
    pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
        self.try_to_arrow(compat_level).unwrap()
    }
832
833 #[inline]
834 pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
835 use DataType::*;
836 match self {
837 Boolean => Ok(ArrowDataType::Boolean),
838 UInt8 => Ok(ArrowDataType::UInt8),
839 UInt16 => Ok(ArrowDataType::UInt16),
840 UInt32 => Ok(ArrowDataType::UInt32),
841 UInt64 => Ok(ArrowDataType::UInt64),
842 Int8 => Ok(ArrowDataType::Int8),
843 Int16 => Ok(ArrowDataType::Int16),
844 Int32 => Ok(ArrowDataType::Int32),
845 Int64 => Ok(ArrowDataType::Int64),
846 Int128 => Ok(ArrowDataType::Int128),
847 Float32 => Ok(ArrowDataType::Float32),
848 Float64 => Ok(ArrowDataType::Float64),
849 #[cfg(feature = "dtype-decimal")]
850 Decimal(precision, scale) => {
851 let precision = (*precision).unwrap_or(38);
852 polars_ensure!(precision <= 38 && precision > 0, InvalidOperation: "decimal precision should be <= 38 & >= 1");
853
854 Ok(ArrowDataType::Decimal(
855 precision,
856 scale.unwrap_or(0), ))
858 },
859 String => {
860 let dt = if compat_level.0 >= 1 {
861 ArrowDataType::Utf8View
862 } else {
863 ArrowDataType::LargeUtf8
864 };
865 Ok(dt)
866 },
867 Binary => {
868 let dt = if compat_level.0 >= 1 {
869 ArrowDataType::BinaryView
870 } else {
871 ArrowDataType::LargeBinary
872 };
873 Ok(dt)
874 },
875 Date => Ok(ArrowDataType::Date32),
876 Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(
877 unit.to_arrow(),
878 tz.as_deref().cloned(),
879 )),
880 Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
881 Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
882 #[cfg(feature = "dtype-array")]
883 Array(dt, size) => Ok(dt
884 .try_to_arrow(compat_level)?
885 .to_fixed_size_list(*size, true)),
886 List(dt) => Ok(ArrowDataType::LargeList(Box::new(
887 dt.to_arrow_field(LIST_VALUES_NAME, compat_level),
888 ))),
889 Null => Ok(ArrowDataType::Null),
890 #[cfg(feature = "object")]
891 Object(_) => Ok(get_object_physical_type()),
892 #[cfg(feature = "dtype-categorical")]
893 Categorical(_, _) | Enum(_, _) => {
894 let arrow_phys = match self.cat_physical().unwrap() {
895 CategoricalPhysical::U8 => IntegerType::UInt8,
896 CategoricalPhysical::U16 => IntegerType::UInt16,
897 CategoricalPhysical::U32 => IntegerType::UInt32,
898 };
899
900 let values = if compat_level.0 >= 1 {
901 ArrowDataType::Utf8View
902 } else {
903 ArrowDataType::LargeUtf8
904 };
905
906 Ok(ArrowDataType::Dictionary(
907 arrow_phys,
908 Box::new(values),
909 false,
910 ))
911 },
912 #[cfg(feature = "dtype-struct")]
913 Struct(fields) => {
914 let fields = fields
915 .iter()
916 .map(|fld| fld.to_arrow(compat_level))
917 .collect();
918 Ok(ArrowDataType::Struct(fields))
919 },
920 BinaryOffset => Ok(ArrowDataType::LargeBinary),
921 Unknown(kind) => {
922 let dt = match kind {
923 UnknownKind::Any | UnknownKind::Ufunc => ArrowDataType::Unknown,
924 UnknownKind::Float => ArrowDataType::Float64,
925 UnknownKind::Str => ArrowDataType::Utf8View,
926 UnknownKind::Int(v) => {
927 return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
928 },
929 };
930 Ok(dt)
931 },
932 }
933 }
934
935 pub fn is_nested_null(&self) -> bool {
936 use DataType::*;
937 match self {
938 Null => true,
939 List(field) => field.is_nested_null(),
940 #[cfg(feature = "dtype-array")]
941 Array(field, _) => field.is_nested_null(),
942 #[cfg(feature = "dtype-struct")]
943 Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
944 _ => false,
945 }
946 }
947
948 pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
955 match (self, schema_type) {
956 (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
957 #[cfg(feature = "dtype-array")]
958 (DataType::Array(l, sl), DataType::Array(r, sr)) => {
959 Ok(l.matches_schema_type(r)? && sl == sr)
960 },
961 #[cfg(feature = "dtype-struct")]
962 (DataType::Struct(l), DataType::Struct(r)) => {
963 if l.len() != r.len() {
964 polars_bail!(SchemaMismatch: "structs have different number of fields: {} vs {}", l.len(), r.len());
965 }
966 let mut must_cast = false;
967 for (l, r) in l.iter().zip(r.iter()) {
968 must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
969 }
970 Ok(must_cast)
971 },
972 (DataType::Null, DataType::Null) => Ok(false),
973 #[cfg(feature = "dtype-decimal")]
974 (DataType::Decimal(_, s1), DataType::Decimal(_, s2)) => Ok(s1 != s2),
975 (DataType::Null, _) => Ok(true),
978 #[cfg(feature = "dtype-categorical")]
979 (DataType::Categorical(l, _), DataType::Categorical(r, _)) => {
980 ensure_same_categories(l, r)?;
981 Ok(false)
982 },
983 #[cfg(feature = "dtype-categorical")]
984 (DataType::Enum(l, _), DataType::Enum(r, _)) => {
985 ensure_same_frozen_categories(l, r)?;
986 Ok(false)
987 },
988
989 (l, r) if l == r => Ok(false),
990 (l, r) => {
991 polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
992 },
993 }
994 }
995
996 #[inline]
997 pub fn is_unknown(&self) -> bool {
998 matches!(self, DataType::Unknown(_))
999 }
1000
1001 pub fn nesting_level(&self) -> usize {
1002 let mut level = 0;
1003 let mut slf = self;
1004 while let Some(inner_dtype) = slf.inner_dtype() {
1005 level += 1;
1006 slf = inner_dtype;
1007 }
1008 level
1009 }
1010
1011 #[cfg(feature = "dtype-categorical")]
1013 pub fn cat_physical(&self) -> PolarsResult<CategoricalPhysical> {
1014 match self {
1015 DataType::Categorical(cats, _) => Ok(cats.physical()),
1016 DataType::Enum(fcats, _) => Ok(fcats.physical()),
1017 _ => {
1018 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
1019 },
1020 }
1021 }
1022
1023 #[cfg(feature = "dtype-categorical")]
1025 pub fn cat_mapping(&self) -> PolarsResult<&Arc<CategoricalMapping>> {
1026 match self {
1027 DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => Ok(mapping),
1028 _ => {
1029 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
1030 },
1031 }
1032 }
1033
1034 #[cfg(feature = "dtype-categorical")]
1035 pub fn from_categories(cats: Arc<Categories>) -> Self {
1036 let mapping = cats.mapping();
1037 Self::Categorical(cats, mapping)
1038 }
1039
1040 #[cfg(feature = "dtype-categorical")]
1041 pub fn from_frozen_categories(fcats: Arc<FrozenCategories>) -> Self {
1042 let mapping = fcats.mapping().clone();
1043 Self::Enum(fcats, mapping)
1044 }
1045
1046 pub fn is_numeric(&self) -> bool {
1047 self.is_integer() || self.is_float() || self.is_decimal()
1048 }
1049}
1050
1051impl Display for DataType {
1052 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1053 let s = match self {
1054 DataType::Null => "null",
1055 DataType::Boolean => "bool",
1056 DataType::UInt8 => "u8",
1057 DataType::UInt16 => "u16",
1058 DataType::UInt32 => "u32",
1059 DataType::UInt64 => "u64",
1060 DataType::Int8 => "i8",
1061 DataType::Int16 => "i16",
1062 DataType::Int32 => "i32",
1063 DataType::Int64 => "i64",
1064 DataType::Int128 => "i128",
1065 DataType::Float32 => "f32",
1066 DataType::Float64 => "f64",
1067 #[cfg(feature = "dtype-decimal")]
1068 DataType::Decimal(precision, scale) => {
1069 return match (precision, scale) {
1070 (Some(precision), Some(scale)) => {
1071 f.write_str(&format!("decimal[{precision},{scale}]"))
1072 },
1073 (None, Some(scale)) => f.write_str(&format!("decimal[*,{scale}]")),
1074 _ => f.write_str("decimal[?]"), };
1076 },
1077 DataType::String => "str",
1078 DataType::Binary => "binary",
1079 DataType::Date => "date",
1080 DataType::Datetime(tu, tz) => {
1081 let s = match tz {
1082 None => format!("datetime[{tu}]"),
1083 Some(tz) => format!("datetime[{tu}, {tz}]"),
1084 };
1085 return f.write_str(&s);
1086 },
1087 DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
1088 DataType::Time => "time",
1089 #[cfg(feature = "dtype-array")]
1090 DataType::Array(_, _) => {
1091 let tp = self.array_leaf_dtype().unwrap();
1092
1093 let dims = self.get_shape().unwrap();
1094 let shape = if dims.len() == 1 {
1095 format!("{}", dims[0])
1096 } else {
1097 format_tuple!(dims)
1098 };
1099 return write!(f, "array[{tp}, {shape}]");
1100 },
1101 DataType::List(tp) => return write!(f, "list[{tp}]"),
1102 #[cfg(feature = "object")]
1103 DataType::Object(s) => s,
1104 #[cfg(feature = "dtype-categorical")]
1105 DataType::Categorical(_, _) => "cat",
1106 #[cfg(feature = "dtype-categorical")]
1107 DataType::Enum(_, _) => "enum",
1108 #[cfg(feature = "dtype-struct")]
1109 DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
1110 DataType::Unknown(kind) => match kind {
1111 UnknownKind::Ufunc => "unknown ufunc",
1112 UnknownKind::Any => "unknown",
1113 UnknownKind::Int(_) => "dyn int",
1114 UnknownKind::Float => "dyn float",
1115 UnknownKind::Str => "dyn str",
1116 },
1117 DataType::BinaryOffset => "binary[offset]",
1118 };
1119 f.write_str(s)
1120 }
1121}
1122
1123impl std::fmt::Debug for DataType {
1124 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1125 use DataType::*;
1126 match self {
1127 Boolean => write!(f, "Boolean"),
1128 UInt8 => write!(f, "UInt8"),
1129 UInt16 => write!(f, "UInt16"),
1130 UInt32 => write!(f, "UInt32"),
1131 UInt64 => write!(f, "UInt64"),
1132 Int8 => write!(f, "Int8"),
1133 Int16 => write!(f, "Int16"),
1134 Int32 => write!(f, "Int32"),
1135 Int64 => write!(f, "Int64"),
1136 Int128 => write!(f, "Int128"),
1137 Float32 => write!(f, "Float32"),
1138 Float64 => write!(f, "Float64"),
1139 String => write!(f, "String"),
1140 Binary => write!(f, "Binary"),
1141 BinaryOffset => write!(f, "BinaryOffset"),
1142 Date => write!(f, "Date"),
1143 Time => write!(f, "Time"),
1144 Duration(unit) => write!(f, "Duration('{unit}')"),
1145 Datetime(unit, opt_tz) => {
1146 if let Some(tz) = opt_tz {
1147 write!(f, "Datetime('{unit}', '{tz}')")
1148 } else {
1149 write!(f, "Datetime('{unit}')")
1150 }
1151 },
1152 #[cfg(feature = "dtype-decimal")]
1153 Decimal(opt_p, opt_s) => match (opt_p, opt_s) {
1154 (None, None) => write!(f, "Decimal(None, None)"),
1155 (None, Some(s)) => write!(f, "Decimal(None, {s})"),
1156 (Some(p), None) => write!(f, "Decimal({p}, None)"),
1157 (Some(p), Some(s)) => write!(f, "Decimal({p}, {s})"),
1158 },
1159 #[cfg(feature = "dtype-array")]
1160 Array(inner, size) => write!(f, "Array({inner:?}, {size})"),
1161 List(inner) => write!(f, "List({inner:?})"),
1162 #[cfg(feature = "dtype-struct")]
1163 Struct(fields) => {
1164 let mut first = true;
1165 write!(f, "Struct({{")?;
1166 for field in fields {
1167 if !first {
1168 write!(f, ", ")?;
1169 }
1170 write!(f, "'{}': {:?}", field.name(), field.dtype())?;
1171 first = false;
1172 }
1173 write!(f, "}})")
1174 },
1175 #[cfg(feature = "dtype-categorical")]
1176 Categorical(cats, _) => {
1177 if cats.is_global() {
1178 write!(f, "Categorical")
1179 } else if cats.namespace().is_empty() && cats.physical() == CategoricalPhysical::U32
1180 {
1181 write!(f, "Categorical('{}')", cats.name())
1182 } else {
1183 write!(
1184 f,
1185 "Categorical('{}', '{}', {:?})",
1186 cats.name(),
1187 cats.namespace(),
1188 cats.physical()
1189 )
1190 }
1191 },
1192 #[cfg(feature = "dtype-categorical")]
1193 Enum(_, _) => write!(f, "Enum([...])"),
1194 #[cfg(feature = "object")]
1195 Object(_) => write!(f, "Object"),
1196 Null => write!(f, "Null"),
1197 Unknown(kind) => write!(f, "Unknown({kind:?})"),
1198 }
1199 }
1200}
1201
1202pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
1203 use DataType::*;
1204 Ok(match (left, right) {
1205 #[cfg(feature = "dtype-categorical")]
1206 (Categorical(cats_l, map), Categorical(cats_r, _)) => {
1207 ensure_same_categories(cats_l, cats_r)?;
1208 Categorical(cats_l.clone(), map.clone())
1209 },
1210 #[cfg(feature = "dtype-categorical")]
1211 (Enum(fcats_l, map), Enum(fcats_r, _)) => {
1212 ensure_same_frozen_categories(fcats_l, fcats_r)?;
1213 Enum(fcats_l.clone(), map.clone())
1214 },
1215 (List(inner_l), List(inner_r)) => {
1216 let merged = merge_dtypes(inner_l, inner_r)?;
1217 List(Box::new(merged))
1218 },
1219 #[cfg(feature = "dtype-struct")]
1220 (Struct(inner_l), Struct(inner_r)) => {
1221 polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1222 let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1223 polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1224 let merged = merge_dtypes(l.dtype(), r.dtype())?;
1225 Ok(Field::new(l.name().clone(), merged))
1226 }).collect::<PolarsResult<Vec<_>>>()?;
1227 Struct(fields)
1228 },
1229 #[cfg(feature = "dtype-array")]
1230 (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1231 polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1232 let merged = merge_dtypes(inner_l, inner_r)?;
1233 Array(Box::new(merged), *width_l)
1234 },
1235 (left, right) if left == right => left.clone(),
1236 _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1237 })
1238}
1239
1240fn collect_nested_types(
1241 dtype: &DataType,
1242 result: &mut PlHashSet<DataType>,
1243 include_compound_types: bool,
1244) {
1245 match dtype {
1246 DataType::List(inner) => {
1247 if include_compound_types {
1248 result.insert(dtype.clone());
1249 }
1250 collect_nested_types(inner, result, include_compound_types);
1251 },
1252 #[cfg(feature = "dtype-array")]
1253 DataType::Array(inner, _) => {
1254 if include_compound_types {
1255 result.insert(dtype.clone());
1256 }
1257 collect_nested_types(inner, result, include_compound_types);
1258 },
1259 #[cfg(feature = "dtype-struct")]
1260 DataType::Struct(fields) => {
1261 if include_compound_types {
1262 result.insert(dtype.clone());
1263 }
1264 for field in fields {
1265 collect_nested_types(field.dtype(), result, include_compound_types);
1266 }
1267 },
1268 _ => {
1269 result.insert(dtype.clone());
1270 },
1271 }
1272}
1273
1274pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1275 let mut result = PlHashSet::new();
1276 collect_nested_types(dtype, &mut result, include_compound_types);
1277 result
1278}
1279
1280#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
1281#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
1282#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
1283pub struct CompatLevel(pub(crate) u16);
1284
1285impl CompatLevel {
1286 pub const fn newest() -> CompatLevel {
1287 CompatLevel(1)
1288 }
1289
1290 pub const fn oldest() -> CompatLevel {
1291 CompatLevel(0)
1292 }
1293
1294 #[doc(hidden)]
1297 pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1298 if level > CompatLevel::newest().0 {
1299 polars_bail!(InvalidOperation: "invalid compat level");
1300 }
1301 Ok(CompatLevel(level))
1302 }
1303
1304 #[doc(hidden)]
1305 pub fn get_level(&self) -> u16 {
1306 self.0
1307 }
1308}
1309
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `List(Array(Float64, 10))` and returns it together with the
    /// inner `Array(Float64, 10)` dtype.
    #[cfg(feature = "dtype-array")]
    fn nested_fixture() -> (DataType, DataType) {
        let array_type = DataType::Array(Box::new(DataType::Float64), 10);
        let list_type = DataType::List(Box::new(array_type.clone()));
        (list_type, array_type)
    }

    /// Without compound types only the innermost dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let (list_type, _) = nested_fixture();

        let mut expected: PlHashSet<DataType> = PlHashSet::new();
        expected.extend([DataType::Float64]);

        let result = unpack_dtypes(&list_type, false);
        assert_eq!(result, expected);
    }

    /// With compound types every nesting layer is reported alongside the
    /// innermost dtype.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let (list_type, array_type) = nested_fixture();

        let result = unpack_dtypes(&list_type, true);

        let mut expected: PlHashSet<DataType> = PlHashSet::new();
        expected.extend([list_type, array_type, DataType::Float64]);

        assert_eq!(result, expected);
    }
}