polars_python/conversion/
any_value.rs

1use std::borrow::{Borrow, Cow};
2use std::sync::{Arc, Mutex};
3
4use chrono::{
5    DateTime, Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike,
6};
7use chrono_tz::Tz;
8use hashbrown::HashMap;
9#[cfg(feature = "object")]
10use polars::chunked_array::object::PolarsObjectSafe;
11#[cfg(feature = "object")]
12use polars::datatypes::OwnedObject;
13use polars::datatypes::{DataType, Field, TimeUnit};
14use polars::prelude::{AnyValue, PlSmallStr, Series, TimeZone};
15use polars_compute::decimal::{DEC128_MAX_PREC, DecimalFmtBuffer, dec128_fits};
16use polars_core::utils::any_values_to_supertype_and_n_dtypes;
17use polars_core::utils::arrow::temporal_conversions::date32_to_date;
18use polars_utils::aliases::PlFixedStateQuality;
19use pyo3::exceptions::{PyOverflowError, PyTypeError, PyValueError};
20use pyo3::prelude::*;
21use pyo3::sync::PyOnceLock;
22use pyo3::types::{
23    PyBool, PyBytes, PyDate, PyDateTime, PyDelta, PyDict, PyFloat, PyInt, PyList, PyMapping,
24    PyRange, PySequence, PyString, PyTime, PyTuple, PyType, PyTzInfo,
25};
26use pyo3::{IntoPyObjectExt, PyTypeCheck, intern};
27
28use super::datetime::{
29    datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,
30};
31use super::{ObjectValue, Wrap, struct_dict};
32use crate::error::PyPolarsErr;
33use crate::py_modules::{pl_series, pl_utils};
34use crate::series::PySeries;
35
36impl<'py> IntoPyObject<'py> for Wrap<AnyValue<'_>> {
37    type Target = PyAny;
38    type Output = Bound<'py, Self::Target>;
39    type Error = PyErr;
40
41    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
42        any_value_into_py_object(self.0, py)
43    }
44}
45
46impl<'py> IntoPyObject<'py> for &Wrap<AnyValue<'_>> {
47    type Target = PyAny;
48    type Output = Bound<'py, Self::Target>;
49    type Error = PyErr;
50
51    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
52        self.clone().into_pyobject(py)
53    }
54}
55
56impl<'py> FromPyObject<'py> for Wrap<AnyValue<'static>> {
57    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
58        py_object_to_any_value(ob, true, true).map(Wrap)
59    }
60}
61
62pub(crate) fn any_value_into_py_object<'py>(
63    av: AnyValue<'_>,
64    py: Python<'py>,
65) -> PyResult<Bound<'py, PyAny>> {
66    let utils = pl_utils(py).bind(py);
67    match av {
68        AnyValue::UInt8(v) => v.into_bound_py_any(py),
69        AnyValue::UInt16(v) => v.into_bound_py_any(py),
70        AnyValue::UInt32(v) => v.into_bound_py_any(py),
71        AnyValue::UInt64(v) => v.into_bound_py_any(py),
72        AnyValue::UInt128(v) => v.into_bound_py_any(py),
73        AnyValue::Int8(v) => v.into_bound_py_any(py),
74        AnyValue::Int16(v) => v.into_bound_py_any(py),
75        AnyValue::Int32(v) => v.into_bound_py_any(py),
76        AnyValue::Int64(v) => v.into_bound_py_any(py),
77        AnyValue::Int128(v) => v.into_bound_py_any(py),
78        AnyValue::Float32(v) => v.into_bound_py_any(py),
79        AnyValue::Float64(v) => v.into_bound_py_any(py),
80        AnyValue::Null => py.None().into_bound_py_any(py),
81        AnyValue::Boolean(v) => v.into_bound_py_any(py),
82        AnyValue::String(v) => v.into_bound_py_any(py),
83        AnyValue::StringOwned(v) => v.into_bound_py_any(py),
84        AnyValue::Categorical(cat, map) | AnyValue::Enum(cat, map) => unsafe {
85            map.cat_to_str_unchecked(cat).into_bound_py_any(py)
86        },
87        AnyValue::CategoricalOwned(cat, map) | AnyValue::EnumOwned(cat, map) => unsafe {
88            map.cat_to_str_unchecked(cat).into_bound_py_any(py)
89        },
90        AnyValue::Date(v) => {
91            let date = date32_to_date(v);
92            date.into_bound_py_any(py)
93        },
94        AnyValue::Datetime(v, time_unit, time_zone) => {
95            datetime_to_py_object(py, v, time_unit, time_zone)
96        },
97        AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
98            datetime_to_py_object(py, v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
99        },
100        AnyValue::Duration(v, time_unit) => {
101            let time_delta = elapsed_offset_to_timedelta(v, time_unit);
102            time_delta.into_bound_py_any(py)
103        },
104        AnyValue::Time(v) => nanos_since_midnight_to_naivetime(v).into_bound_py_any(py),
105        AnyValue::Array(v, _) | AnyValue::List(v) => PySeries::new(v).to_list(py),
106        ref av @ AnyValue::Struct(_, _, flds) => {
107            Ok(struct_dict(py, av._iter_struct_av(), flds)?.into_any())
108        },
109        AnyValue::StructOwned(payload) => {
110            Ok(struct_dict(py, payload.0.into_iter(), &payload.1)?.into_any())
111        },
112        #[cfg(feature = "object")]
113        AnyValue::Object(v) => {
114            let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
115            Ok(object.inner.clone_ref(py).into_bound(py))
116        },
117        #[cfg(feature = "object")]
118        AnyValue::ObjectOwned(v) => {
119            let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
120            Ok(object.inner.clone_ref(py).into_bound(py))
121        },
122        AnyValue::Binary(v) => PyBytes::new(py, v).into_bound_py_any(py),
123        AnyValue::BinaryOwned(v) => PyBytes::new(py, &v).into_bound_py_any(py),
124        AnyValue::Decimal(v, prec, scale) => {
125            let convert = utils.getattr(intern!(py, "to_py_decimal"))?;
126            let mut buf = DecimalFmtBuffer::new();
127            let s = buf.format_dec128(v, scale, false, false);
128            convert.call1((prec, s))
129        },
130    }
131}
132
133/// Holds a Python type object and implements hashing / equality based on the pointer address of the
134/// type object. This is used as a hashtable key instead of only the `usize` pointer value, as we
135/// need to hold a ref to the Python type object to keep it alive.
136#[derive(Debug)]
137pub struct TypeObjectKey {
138    #[allow(unused)]
139    type_object: Py<PyType>,
140    /// We need to store this in a field for `Borrow<usize>`
141    address: usize,
142}
143
144impl TypeObjectKey {
145    fn new(type_object: Py<PyType>) -> Self {
146        let address = type_object.as_ptr() as usize;
147        Self {
148            type_object,
149            address,
150        }
151    }
152}
153
154impl PartialEq for TypeObjectKey {
155    fn eq(&self, other: &Self) -> bool {
156        self.address == other.address
157    }
158}
159
160impl Eq for TypeObjectKey {}
161
162impl std::borrow::Borrow<usize> for TypeObjectKey {
163    fn borrow(&self) -> &usize {
164        &self.address
165    }
166}
167
168impl std::hash::Hash for TypeObjectKey {
169    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
170        let v: &usize = self.borrow();
171        v.hash(state)
172    }
173}
174
175type InitFn = fn(&Bound<'_, PyAny>, bool) -> PyResult<AnyValue<'static>>;
176pub(crate) static LUT: Mutex<HashMap<TypeObjectKey, InitFn, PlFixedStateQuality>> =
177    Mutex::new(HashMap::with_hasher(PlFixedStateQuality::with_seed(0)));
178
179/// Convert a Python object to an [`AnyValue`].
180pub(crate) fn py_object_to_any_value(
181    ob: &Bound<'_, PyAny>,
182    strict: bool,
183    allow_object: bool,
184) -> PyResult<AnyValue<'static>> {
185    // Conversion functions.
186    fn get_null(_ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
187        Ok(AnyValue::Null)
188    }
189
190    fn get_bool(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
191        let b = ob.extract::<bool>()?;
192        Ok(AnyValue::Boolean(b))
193    }
194
195    fn get_int(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
196        if let Ok(v) = ob.extract::<i64>() {
197            Ok(AnyValue::Int64(v))
198        } else if let Ok(v) = ob.extract::<i128>() {
199            Ok(AnyValue::Int128(v))
200        } else if let Ok(v) = ob.extract::<u64>() {
201            Ok(AnyValue::UInt64(v))
202        } else if let Ok(v) = ob.extract::<u128>() {
203            Ok(AnyValue::UInt128(v))
204        } else if !strict {
205            let f = ob.extract::<f64>()?;
206            Ok(AnyValue::Float64(f))
207        } else {
208            Err(PyOverflowError::new_err(format!(
209                "int value too large for Polars integer types: {ob}"
210            )))
211        }
212    }
213
214    fn get_float(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
215        Ok(AnyValue::Float64(ob.extract::<f64>()?))
216    }
217
218    fn get_str(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
219        // Ideally we'd be returning an AnyValue::String(&str) instead, as was
220        // the case in previous versions of this function. However, if compiling
221        // with abi3 for versions older than Python 3.10, the APIs that purport
222        // to return &str actually just encode to UTF-8 as a newly allocated
223        // PyBytes object, and then return reference to that. So what we're
224        // doing here isn't any different fundamentally, and the APIs to for
225        // converting to &str are deprecated in PyO3 0.21.
226        //
227        // Once Python 3.10 is the minimum supported version, converting to &str
228        // will be cheaper, and we should do that. Python 3.9 security updates
229        // end-of-life is Oct 31, 2025.
230        Ok(AnyValue::StringOwned(ob.extract::<String>()?.into()))
231    }
232
233    fn get_bytes(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
234        let value = ob.extract::<Vec<u8>>()?;
235        Ok(AnyValue::BinaryOwned(value))
236    }
237
238    fn get_date(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
239        const UNIX_EPOCH: NaiveDate = DateTime::UNIX_EPOCH.naive_utc().date();
240        let date = ob.extract::<NaiveDate>()?;
241        let elapsed = date.signed_duration_since(UNIX_EPOCH);
242        Ok(AnyValue::Date(elapsed.num_days() as i32))
243    }
244
245    fn get_datetime(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
246        let py = ob.py();
247        let tzinfo = ob.getattr(intern!(py, "tzinfo"))?;
248
249        if tzinfo.is_none() {
250            let datetime = ob.extract::<NaiveDateTime>()?;
251            let delta = datetime - DateTime::UNIX_EPOCH.naive_utc();
252            let timestamp = delta.num_microseconds().unwrap();
253            return Ok(AnyValue::Datetime(timestamp, TimeUnit::Microseconds, None));
254        }
255
256        // Try converting `pytz` timezone to `zoneinfo` timezone
257        let (ob, tzinfo) = if let Some(tz) = tzinfo
258            .getattr(intern!(py, "zone"))
259            .ok()
260            .and_then(|tz| (!tz.is_none()).then_some(tz))
261        {
262            let tzinfo = PyTzInfo::timezone(py, tz.downcast_into::<PyString>()?)?;
263            (
264                &ob.call_method(intern!(py, "astimezone"), (&tzinfo,), None)?,
265                tzinfo,
266            )
267        } else {
268            (ob, tzinfo.downcast_into()?)
269        };
270
271        let (timestamp, tz) = if tzinfo.hasattr(intern!(py, "key"))? {
272            let datetime = ob.extract::<DateTime<Tz>>()?;
273            let tz = unsafe { TimeZone::from_static(datetime.timezone().name()) };
274            if datetime.year() >= 2100 {
275                // chrono-tz does not support dates after 2100
276                // https://github.com/chronotope/chrono-tz/issues/135
277                (
278                    pl_utils(py)
279                        .bind(py)
280                        .getattr(intern!(py, "datetime_to_int"))?
281                        .call1((ob, intern!(py, "us")))?
282                        .extract::<i64>()?,
283                    tz,
284                )
285            } else {
286                let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
287                (delta.num_microseconds().unwrap(), tz)
288            }
289        } else {
290            let datetime = ob.extract::<DateTime<FixedOffset>>()?;
291            let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
292            (delta.num_microseconds().unwrap(), TimeZone::UTC)
293        };
294
295        Ok(AnyValue::DatetimeOwned(
296            timestamp,
297            TimeUnit::Microseconds,
298            Some(Arc::new(tz)),
299        ))
300    }
301
302    fn get_timedelta(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
303        let timedelta = ob.extract::<TimeDelta>()?;
304        if let Some(micros) = timedelta.num_microseconds() {
305            Ok(AnyValue::Duration(micros, TimeUnit::Microseconds))
306        } else {
307            Ok(AnyValue::Duration(
308                timedelta.num_milliseconds(),
309                TimeUnit::Milliseconds,
310            ))
311        }
312    }
313
314    fn get_time(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
315        let time = ob.extract::<NaiveTime>()?;
316
317        Ok(AnyValue::Time(
318            (time.num_seconds_from_midnight() as i64) * 1_000_000_000 + time.nanosecond() as i64,
319        ))
320    }
321
322    fn get_decimal(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
323        fn abs_decimal_from_digits(
324            digits: impl IntoIterator<Item = u8>,
325            exp: i32,
326        ) -> Option<(i128, usize)> {
327            let mut v = 0_i128;
328            for d in digits {
329                v = v.checked_mul(10)?.checked_add(d as i128)?;
330            }
331            let scale = if exp > 0 {
332                v = 10_i128.checked_pow(exp as u32)?.checked_mul(v)?;
333                0
334            } else {
335                (-exp) as usize
336            };
337            dec128_fits(v, DEC128_MAX_PREC).then_some((v, scale))
338        }
339
340        // Note: Using Vec<u8> is not the most efficient thing here (input is a tuple)
341        let (sign, digits, exp): (i8, Vec<u8>, i32) = ob
342            .call_method0(intern!(ob.py(), "as_tuple"))
343            .unwrap()
344            .extract()
345            .unwrap();
346        let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
347            PyErr::from(PyPolarsErr::Other(
348                "Decimal is too large to fit in Decimal128".into(),
349            ))
350        })?;
351        if sign > 0 {
352            v = -v; // Won't overflow since -i128::MAX > i128::MIN
353        }
354        Ok(AnyValue::Decimal(v, DEC128_MAX_PREC, scale))
355    }
356
357    fn get_list(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
358        fn get_list_with_constructor(
359            ob: &Bound<'_, PyAny>,
360            strict: bool,
361        ) -> PyResult<AnyValue<'static>> {
362            // Use the dedicated constructor.
363            // This constructor is able to go via dedicated type constructors
364            // so it can be much faster.
365            let py = ob.py();
366            let kwargs = PyDict::new(py);
367            kwargs.set_item("strict", strict)?;
368            let s = pl_series(py).call(py, (ob,), Some(&kwargs))?;
369            get_list_from_series(s.bind(py), strict)
370        }
371
372        if ob.is_empty()? {
373            Ok(AnyValue::List(Series::new_empty(
374                PlSmallStr::EMPTY,
375                &DataType::Null,
376            )))
377        } else if ob.is_instance_of::<PyList>() | ob.is_instance_of::<PyTuple>() {
378            let list = ob.downcast::<PySequence>()?;
379
380            // Try to find first non-null.
381            let length = list.len()?;
382            let mut iter = list.try_iter()?;
383            let mut avs = Vec::new();
384            for item in &mut iter {
385                let av = py_object_to_any_value(&item?, strict, true)?;
386                let is_null = av.is_null();
387                avs.push(av);
388                if is_null {
389                    break;
390                }
391            }
392
393            // Try to use a faster converter.
394            if let Some(av) = avs.last()
395                && !av.is_null()
396                && av.dtype().is_primitive()
397            {
398                // Always use strict, we will filter the error if we're not
399                // strict and try again using a slower converter with supertype.
400                match get_list_with_constructor(ob, true) {
401                    Ok(ret) => return Ok(ret),
402                    Err(e) => {
403                        if strict {
404                            return Err(e);
405                        }
406                    },
407                }
408            }
409
410            // Push the rest of the anyvalues and use slower converter.
411            avs.reserve(length);
412            for item in &mut iter {
413                avs.push(py_object_to_any_value(&item?, strict, true)?);
414            }
415
416            let (dtype, _n_dtypes) = any_values_to_supertype_and_n_dtypes(&avs)
417                .map_err(|e| PyTypeError::new_err(e.to_string()))?;
418            let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, strict)
419                .map_err(|e| {
420                    PyTypeError::new_err(format!(
421                        "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types."
422                    ))
423                })?;
424            Ok(AnyValue::List(s))
425        } else {
426            // range will take this branch
427            get_list_with_constructor(ob, strict)
428        }
429    }
430
431    fn get_list_from_series(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
432        let s = super::get_series(ob)?;
433        Ok(AnyValue::List(s))
434    }
435
436    fn get_mapping(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
437        let mapping = ob.downcast::<PyMapping>()?;
438        let len = mapping.len()?;
439        let mut keys = Vec::with_capacity(len);
440        let mut vals = Vec::with_capacity(len);
441
442        for item in mapping.items()?.try_iter()? {
443            let item = item?.downcast_into::<PyTuple>()?;
444            let (key_py, val_py) = (item.get_item(0)?, item.get_item(1)?);
445
446            let key: Cow<str> = key_py.extract()?;
447            let val = py_object_to_any_value(&val_py, strict, true)?;
448
449            keys.push(Field::new(key.as_ref().into(), val.dtype()));
450            vals.push(val);
451        }
452        Ok(AnyValue::StructOwned(Box::new((vals, keys))))
453    }
454
455    fn get_struct(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
456        let dict = ob.downcast::<PyDict>().unwrap();
457        let len = dict.len();
458        let mut keys = Vec::with_capacity(len);
459        let mut vals = Vec::with_capacity(len);
460        for (k, v) in dict.into_iter() {
461            let key = k.extract::<Cow<str>>()?;
462            let val = py_object_to_any_value(&v, strict, true)?;
463            let dtype = val.dtype();
464            keys.push(Field::new(key.as_ref().into(), dtype));
465            vals.push(val)
466        }
467        Ok(AnyValue::StructOwned(Box::new((vals, keys))))
468    }
469
470    fn get_object(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
471        #[cfg(feature = "object")]
472        {
473            // This is slow, but hey don't use objects.
474            let v = &ObjectValue {
475                inner: ob.clone().unbind(),
476            };
477            Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))
478        }
479        #[cfg(not(feature = "object"))]
480        panic!("activate object")
481    }
482
483    /// Determine which conversion function to use for the given object.
484    ///
485    /// Note: This function is only ran if the object's type is not already in the
486    /// lookup table.
487    fn get_conversion_function(ob: &Bound<'_, PyAny>, allow_object: bool) -> PyResult<InitFn> {
488        let py = ob.py();
489        if ob.is_none() {
490            Ok(get_null)
491        }
492        // bool must be checked before int because Python bool is an instance of int.
493        else if ob.is_instance_of::<PyBool>() {
494            Ok(get_bool)
495        } else if ob.is_instance_of::<PyInt>() {
496            Ok(get_int)
497        } else if ob.is_instance_of::<PyFloat>() {
498            Ok(get_float)
499        } else if ob.is_instance_of::<PyString>() {
500            Ok(get_str)
501        } else if ob.is_instance_of::<PyBytes>() {
502            Ok(get_bytes)
503        } else if ob.is_instance_of::<PyList>() || ob.is_instance_of::<PyTuple>() {
504            Ok(get_list)
505        } else if ob.is_instance_of::<PyDict>() {
506            Ok(get_struct)
507        } else if PyMapping::type_check(ob) {
508            Ok(get_mapping)
509        }
510        // note: datetime must be checked *before* date
511        // (as python datetime is an instance of date)
512        else if PyDateTime::type_check(ob) {
513            Ok(get_datetime as InitFn)
514        } else if PyDate::type_check(ob) {
515            Ok(get_date as InitFn)
516        } else if PyTime::type_check(ob) {
517            Ok(get_time as InitFn)
518        } else if PyDelta::type_check(ob) {
519            Ok(get_timedelta as InitFn)
520        } else if ob.is_instance_of::<PyRange>() {
521            Ok(get_list as InitFn)
522        } else {
523            static NDARRAY_TYPE: PyOnceLock<Py<PyType>> = PyOnceLock::new();
524            if let Ok(ndarray_type) = NDARRAY_TYPE.import(py, "numpy", "ndarray") {
525                if ob.is_instance(ndarray_type)? {
526                    // will convert via Series -> mmap_numpy_array
527                    return Ok(get_list as InitFn);
528                }
529            }
530            static DECIMAL_TYPE: PyOnceLock<Py<PyType>> = PyOnceLock::new();
531            if ob.is_instance(DECIMAL_TYPE.import(py, "decimal", "Decimal")?)? {
532                return Ok(get_decimal as InitFn);
533            }
534
535            // support NumPy scalars
536            if ob.extract::<i64>().is_ok() || ob.extract::<u64>().is_ok() {
537                return Ok(get_int as InitFn);
538            } else if ob.extract::<f64>().is_ok() {
539                return Ok(get_float as InitFn);
540            }
541
542            if allow_object {
543                Ok(get_object as InitFn)
544            } else {
545                Err(PyValueError::new_err(format!("Cannot convert {ob}")))
546            }
547        }
548    }
549
550    let py_type = ob.get_type();
551    let py_type_address = py_type.as_ptr() as usize;
552
553    let conversion_func = {
554        if let Some(cached_func) = LUT.lock().unwrap().get(&py_type_address) {
555            *cached_func
556        } else {
557            let k = TypeObjectKey::new(py_type.clone().unbind());
558            assert_eq!(k.address, py_type_address);
559
560            let func = get_conversion_function(ob, allow_object)?;
561            LUT.lock().unwrap().insert(k, func);
562            func
563        }
564    };
565
566    conversion_func(ob, strict)
567}