polars_python/conversion/
any_value.rs

1use std::borrow::{Borrow, Cow};
2use std::sync::{Arc, Mutex};
3
4use chrono::{
5    DateTime, Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike,
6};
7use chrono_tz::Tz;
8use hashbrown::HashMap;
9#[cfg(feature = "object")]
10use polars::chunked_array::object::PolarsObjectSafe;
11#[cfg(feature = "object")]
12use polars::datatypes::OwnedObject;
13use polars::datatypes::{DataType, Field, TimeUnit};
14use polars::prelude::{AnyValue, PlSmallStr, Series, TimeZone};
15use polars_core::utils::any_values_to_supertype_and_n_dtypes;
16use polars_core::utils::arrow::temporal_conversions::date32_to_date;
17use polars_utils::aliases::PlFixedStateQuality;
18use pyo3::exceptions::{PyOverflowError, PyTypeError, PyValueError};
19use pyo3::prelude::*;
20use pyo3::sync::GILOnceCell;
21use pyo3::types::{
22    PyBool, PyBytes, PyDate, PyDateTime, PyDelta, PyDict, PyFloat, PyInt, PyList, PyMapping,
23    PyRange, PySequence, PyString, PyTime, PyTuple, PyType, PyTzInfo,
24};
25use pyo3::{IntoPyObjectExt, PyTypeCheck, intern};
26
27use super::datetime::{
28    datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,
29};
30use super::{ObjectValue, Wrap, decimal_to_digits, struct_dict};
31use crate::error::PyPolarsErr;
32use crate::py_modules::{pl_series, pl_utils};
33use crate::series::PySeries;
34
35impl<'py> IntoPyObject<'py> for Wrap<AnyValue<'_>> {
36    type Target = PyAny;
37    type Output = Bound<'py, Self::Target>;
38    type Error = PyErr;
39
40    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
41        any_value_into_py_object(self.0, py)
42    }
43}
44
45impl<'py> IntoPyObject<'py> for &Wrap<AnyValue<'_>> {
46    type Target = PyAny;
47    type Output = Bound<'py, Self::Target>;
48    type Error = PyErr;
49
50    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
51        self.clone().into_pyobject(py)
52    }
53}
54
55impl<'py> FromPyObject<'py> for Wrap<AnyValue<'static>> {
56    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
57        py_object_to_any_value(ob, true, true).map(Wrap)
58    }
59}
60
61pub(crate) fn any_value_into_py_object<'py>(
62    av: AnyValue<'_>,
63    py: Python<'py>,
64) -> PyResult<Bound<'py, PyAny>> {
65    let utils = pl_utils(py).bind(py);
66    match av {
67        AnyValue::UInt8(v) => v.into_bound_py_any(py),
68        AnyValue::UInt16(v) => v.into_bound_py_any(py),
69        AnyValue::UInt32(v) => v.into_bound_py_any(py),
70        AnyValue::UInt64(v) => v.into_bound_py_any(py),
71        AnyValue::Int8(v) => v.into_bound_py_any(py),
72        AnyValue::Int16(v) => v.into_bound_py_any(py),
73        AnyValue::Int32(v) => v.into_bound_py_any(py),
74        AnyValue::Int64(v) => v.into_bound_py_any(py),
75        AnyValue::Int128(v) => v.into_bound_py_any(py),
76        AnyValue::Float32(v) => v.into_bound_py_any(py),
77        AnyValue::Float64(v) => v.into_bound_py_any(py),
78        AnyValue::Null => py.None().into_bound_py_any(py),
79        AnyValue::Boolean(v) => v.into_bound_py_any(py),
80        AnyValue::String(v) => v.into_bound_py_any(py),
81        AnyValue::StringOwned(v) => v.into_bound_py_any(py),
82        AnyValue::Categorical(cat, map) | AnyValue::Enum(cat, map) => unsafe {
83            map.cat_to_str_unchecked(cat).into_bound_py_any(py)
84        },
85        AnyValue::CategoricalOwned(cat, map) | AnyValue::EnumOwned(cat, map) => unsafe {
86            map.cat_to_str_unchecked(cat).into_bound_py_any(py)
87        },
88        AnyValue::Date(v) => {
89            let date = date32_to_date(v);
90            date.into_bound_py_any(py)
91        },
92        AnyValue::Datetime(v, time_unit, time_zone) => {
93            datetime_to_py_object(py, v, time_unit, time_zone)
94        },
95        AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
96            datetime_to_py_object(py, v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
97        },
98        AnyValue::Duration(v, time_unit) => {
99            let time_delta = elapsed_offset_to_timedelta(v, time_unit);
100            time_delta.into_bound_py_any(py)
101        },
102        AnyValue::Time(v) => nanos_since_midnight_to_naivetime(v).into_bound_py_any(py),
103        AnyValue::Array(v, _) | AnyValue::List(v) => PySeries::new(v).to_list(py),
104        ref av @ AnyValue::Struct(_, _, flds) => {
105            Ok(struct_dict(py, av._iter_struct_av(), flds)?.into_any())
106        },
107        AnyValue::StructOwned(payload) => {
108            Ok(struct_dict(py, payload.0.into_iter(), &payload.1)?.into_any())
109        },
110        #[cfg(feature = "object")]
111        AnyValue::Object(v) => {
112            let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
113            Ok(object.inner.clone_ref(py).into_bound(py))
114        },
115        #[cfg(feature = "object")]
116        AnyValue::ObjectOwned(v) => {
117            let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
118            Ok(object.inner.clone_ref(py).into_bound(py))
119        },
120        AnyValue::Binary(v) => PyBytes::new(py, v).into_bound_py_any(py),
121        AnyValue::BinaryOwned(v) => PyBytes::new(py, &v).into_bound_py_any(py),
122        AnyValue::Decimal(v, scale) => {
123            let convert = utils.getattr(intern!(py, "to_py_decimal"))?;
124            const N: usize = 3;
125            let mut buf = [0_u128; N];
126            let n_digits = decimal_to_digits(v.abs(), &mut buf);
127            let buf = unsafe {
128                std::slice::from_raw_parts(
129                    buf.as_slice().as_ptr() as *const u8,
130                    N * size_of::<u128>(),
131                )
132            };
133            let digits = PyTuple::new(py, buf.iter().take(n_digits))?;
134            convert.call1((v.is_negative() as u8, digits, n_digits, -(scale as i32)))
135        },
136    }
137}
138
139/// Holds a Python type object and implements hashing / equality based on the pointer address of the
140/// type object. This is used as a hashtable key instead of only the `usize` pointer value, as we
141/// need to hold a ref to the Python type object to keep it alive.
142#[derive(Debug)]
143pub struct TypeObjectKey {
144    #[allow(unused)]
145    type_object: Py<PyType>,
146    /// We need to store this in a field for `Borrow<usize>`
147    address: usize,
148}
149
150impl TypeObjectKey {
151    fn new(type_object: Py<PyType>) -> Self {
152        let address = type_object.as_ptr() as usize;
153        Self {
154            type_object,
155            address,
156        }
157    }
158}
159
160impl PartialEq for TypeObjectKey {
161    fn eq(&self, other: &Self) -> bool {
162        self.address == other.address
163    }
164}
165
166impl Eq for TypeObjectKey {}
167
168impl std::borrow::Borrow<usize> for TypeObjectKey {
169    fn borrow(&self) -> &usize {
170        &self.address
171    }
172}
173
174impl std::hash::Hash for TypeObjectKey {
175    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
176        let v: &usize = self.borrow();
177        v.hash(state)
178    }
179}
180
181type InitFn = fn(&Bound<'_, PyAny>, bool) -> PyResult<AnyValue<'static>>;
182pub(crate) static LUT: Mutex<HashMap<TypeObjectKey, InitFn, PlFixedStateQuality>> =
183    Mutex::new(HashMap::with_hasher(PlFixedStateQuality::with_seed(0)));
184
185/// Convert a Python object to an [`AnyValue`].
186pub(crate) fn py_object_to_any_value(
187    ob: &Bound<'_, PyAny>,
188    strict: bool,
189    allow_object: bool,
190) -> PyResult<AnyValue<'static>> {
191    // Conversion functions.
192    fn get_null(_ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
193        Ok(AnyValue::Null)
194    }
195
196    fn get_bool(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
197        let b = ob.extract::<bool>()?;
198        Ok(AnyValue::Boolean(b))
199    }
200
201    fn get_int(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
202        if let Ok(v) = ob.extract::<i64>() {
203            Ok(AnyValue::Int64(v))
204        } else if let Ok(v) = ob.extract::<i128>() {
205            Ok(AnyValue::Int128(v))
206        } else if !strict {
207            let f = ob.extract::<f64>()?;
208            Ok(AnyValue::Float64(f))
209        } else {
210            Err(PyOverflowError::new_err(format!(
211                "int value too large for Polars integer types: {ob}"
212            )))
213        }
214    }
215
216    fn get_float(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
217        Ok(AnyValue::Float64(ob.extract::<f64>()?))
218    }
219
220    fn get_str(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
221        // Ideally we'd be returning an AnyValue::String(&str) instead, as was
222        // the case in previous versions of this function. However, if compiling
223        // with abi3 for versions older than Python 3.10, the APIs that purport
224        // to return &str actually just encode to UTF-8 as a newly allocated
225        // PyBytes object, and then return reference to that. So what we're
226        // doing here isn't any different fundamentally, and the APIs to for
227        // converting to &str are deprecated in PyO3 0.21.
228        //
229        // Once Python 3.10 is the minimum supported version, converting to &str
230        // will be cheaper, and we should do that. Python 3.9 security updates
231        // end-of-life is Oct 31, 2025.
232        Ok(AnyValue::StringOwned(ob.extract::<String>()?.into()))
233    }
234
235    fn get_bytes(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
236        let value = ob.extract::<Vec<u8>>()?;
237        Ok(AnyValue::BinaryOwned(value))
238    }
239
240    fn get_date(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
241        const UNIX_EPOCH: NaiveDate = DateTime::UNIX_EPOCH.naive_utc().date();
242        let date = ob.extract::<NaiveDate>()?;
243        let elapsed = date.signed_duration_since(UNIX_EPOCH);
244        Ok(AnyValue::Date(elapsed.num_days() as i32))
245    }
246
247    fn get_datetime(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
248        let py = ob.py();
249        let tzinfo = ob.getattr(intern!(py, "tzinfo"))?;
250
251        if tzinfo.is_none() {
252            let datetime = ob.extract::<NaiveDateTime>()?;
253            let delta = datetime - DateTime::UNIX_EPOCH.naive_utc();
254            let timestamp = delta.num_microseconds().unwrap();
255            return Ok(AnyValue::Datetime(timestamp, TimeUnit::Microseconds, None));
256        }
257
258        // Try converting `pytz` timezone to `zoneinfo` timezone
259        let (ob, tzinfo) = if let Some(tz) = tzinfo
260            .getattr(intern!(py, "zone"))
261            .ok()
262            .and_then(|tz| (!tz.is_none()).then_some(tz))
263        {
264            let tzinfo = PyTzInfo::timezone(py, tz.downcast_into::<PyString>()?)?;
265            (
266                &ob.call_method(intern!(py, "astimezone"), (&tzinfo,), None)?,
267                tzinfo,
268            )
269        } else {
270            (ob, tzinfo.downcast_into()?)
271        };
272
273        let (timestamp, tz) = if tzinfo.hasattr(intern!(py, "key"))? {
274            let datetime = ob.extract::<DateTime<Tz>>()?;
275            let tz = unsafe { TimeZone::from_static(datetime.timezone().name()) };
276            if datetime.year() >= 2100 {
277                // chrono-tz does not support dates after 2100
278                // https://github.com/chronotope/chrono-tz/issues/135
279                (
280                    pl_utils(py)
281                        .bind(py)
282                        .getattr(intern!(py, "datetime_to_int"))?
283                        .call1((ob, intern!(py, "us")))?
284                        .extract::<i64>()?,
285                    tz,
286                )
287            } else {
288                let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
289                (delta.num_microseconds().unwrap(), tz)
290            }
291        } else {
292            let datetime = ob.extract::<DateTime<FixedOffset>>()?;
293            let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
294            (delta.num_microseconds().unwrap(), TimeZone::UTC)
295        };
296
297        Ok(AnyValue::DatetimeOwned(
298            timestamp,
299            TimeUnit::Microseconds,
300            Some(Arc::new(tz)),
301        ))
302    }
303
304    fn get_timedelta(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
305        let timedelta = ob.extract::<TimeDelta>()?;
306        if let Some(micros) = timedelta.num_microseconds() {
307            Ok(AnyValue::Duration(micros, TimeUnit::Microseconds))
308        } else {
309            Ok(AnyValue::Duration(
310                timedelta.num_milliseconds(),
311                TimeUnit::Milliseconds,
312            ))
313        }
314    }
315
316    fn get_time(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
317        let time = ob.extract::<NaiveTime>()?;
318
319        Ok(AnyValue::Time(
320            (time.num_seconds_from_midnight() as i64) * 1_000_000_000 + time.nanosecond() as i64,
321        ))
322    }
323
324    fn get_decimal(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
325        fn abs_decimal_from_digits(
326            digits: impl IntoIterator<Item = u8>,
327            exp: i32,
328        ) -> Option<(i128, usize)> {
329            const MAX_ABS_DEC: i128 = 10_i128.pow(38) - 1;
330            let mut v = 0_i128;
331            for (i, d) in digits.into_iter().map(i128::from).enumerate() {
332                if i < 38 {
333                    v = v * 10 + d;
334                } else {
335                    v = v.checked_mul(10).and_then(|v| v.checked_add(d))?;
336                }
337            }
338            // We only support non-negative scale (=> non-positive exponent).
339            let scale = if exp > 0 {
340                // The decimal may be in a non-canonical representation, try to fix it first.
341                v = 10_i128
342                    .checked_pow(exp as u32)
343                    .and_then(|factor| v.checked_mul(factor))?;
344                0
345            } else {
346                (-exp) as usize
347            };
348            // TODO: Do we care for checking if it fits in MAX_ABS_DEC? (if we set precision to None anyway?)
349            (v <= MAX_ABS_DEC).then_some((v, scale))
350        }
351
352        // Note: Using Vec<u8> is not the most efficient thing here (input is a tuple)
353        let (sign, digits, exp): (i8, Vec<u8>, i32) = ob
354            .call_method0(intern!(ob.py(), "as_tuple"))
355            .unwrap()
356            .extract()
357            .unwrap();
358        let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
359            PyErr::from(PyPolarsErr::Other(
360                "Decimal is too large to fit in Decimal128".into(),
361            ))
362        })?;
363        if sign > 0 {
364            v = -v; // Won't overflow since -i128::MAX > i128::MIN
365        }
366        Ok(AnyValue::Decimal(v, scale))
367    }
368
369    fn get_list(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
370        fn get_list_with_constructor(
371            ob: &Bound<'_, PyAny>,
372            strict: bool,
373        ) -> PyResult<AnyValue<'static>> {
374            // Use the dedicated constructor.
375            // This constructor is able to go via dedicated type constructors
376            // so it can be much faster.
377            let py = ob.py();
378            let kwargs = PyDict::new(py);
379            kwargs.set_item("strict", strict)?;
380            let s = pl_series(py).call(py, (ob,), Some(&kwargs))?;
381            get_list_from_series(s.bind(py), strict)
382        }
383
384        if ob.is_empty()? {
385            Ok(AnyValue::List(Series::new_empty(
386                PlSmallStr::EMPTY,
387                &DataType::Null,
388            )))
389        } else if ob.is_instance_of::<PyList>() | ob.is_instance_of::<PyTuple>() {
390            let list = ob.downcast::<PySequence>()?;
391
392            // Try to find first non-null.
393            let length = list.len()?;
394            let mut iter = list.try_iter()?;
395            let mut avs = Vec::new();
396            for item in &mut iter {
397                let av = py_object_to_any_value(&item?, strict, true)?;
398                let is_null = av.is_null();
399                avs.push(av);
400                if is_null {
401                    break;
402                }
403            }
404
405            // Try to use a faster converter.
406            if let Some(av) = avs.last()
407                && !av.is_null()
408                && av.dtype().is_primitive()
409            {
410                // Always use strict, we will filter the error if we're not
411                // strict and try again using a slower converter with supertype.
412                match get_list_with_constructor(ob, true) {
413                    Ok(ret) => return Ok(ret),
414                    Err(e) => {
415                        if strict {
416                            return Err(e);
417                        }
418                    },
419                }
420            }
421
422            // Push the rest of the anyvalues and use slower converter.
423            avs.reserve(length);
424            for item in &mut iter {
425                avs.push(py_object_to_any_value(&item?, strict, true)?);
426            }
427
428            let (dtype, _n_dtypes) = any_values_to_supertype_and_n_dtypes(&avs)
429                .map_err(|e| PyTypeError::new_err(e.to_string()))?;
430            let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, strict)
431                .map_err(|e| {
432                    PyTypeError::new_err(format!(
433                        "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types."
434                    ))
435                })?;
436            Ok(AnyValue::List(s))
437        } else {
438            // range will take this branch
439            get_list_with_constructor(ob, strict)
440        }
441    }
442
443    fn get_list_from_series(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
444        let s = super::get_series(ob)?;
445        Ok(AnyValue::List(s))
446    }
447
448    fn get_mapping(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
449        let mapping = ob.downcast::<PyMapping>()?;
450        let len = mapping.len()?;
451        let mut keys = Vec::with_capacity(len);
452        let mut vals = Vec::with_capacity(len);
453
454        for item in mapping.items()?.try_iter()? {
455            let item = item?.downcast_into::<PyTuple>()?;
456            let (key_py, val_py) = (item.get_item(0)?, item.get_item(1)?);
457
458            let key: Cow<str> = key_py.extract()?;
459            let val = py_object_to_any_value(&val_py, strict, true)?;
460
461            keys.push(Field::new(key.as_ref().into(), val.dtype()));
462            vals.push(val);
463        }
464        Ok(AnyValue::StructOwned(Box::new((vals, keys))))
465    }
466
467    fn get_struct(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
468        let dict = ob.downcast::<PyDict>().unwrap();
469        let len = dict.len();
470        let mut keys = Vec::with_capacity(len);
471        let mut vals = Vec::with_capacity(len);
472        for (k, v) in dict.into_iter() {
473            let key = k.extract::<Cow<str>>()?;
474            let val = py_object_to_any_value(&v, strict, true)?;
475            let dtype = val.dtype();
476            keys.push(Field::new(key.as_ref().into(), dtype));
477            vals.push(val)
478        }
479        Ok(AnyValue::StructOwned(Box::new((vals, keys))))
480    }
481
482    fn get_object(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
483        #[cfg(feature = "object")]
484        {
485            // This is slow, but hey don't use objects.
486            let v = &ObjectValue {
487                inner: ob.clone().unbind(),
488            };
489            Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))
490        }
491        #[cfg(not(feature = "object"))]
492        panic!("activate object")
493    }
494
495    /// Determine which conversion function to use for the given object.
496    ///
497    /// Note: This function is only ran if the object's type is not already in the
498    /// lookup table.
499    fn get_conversion_function(ob: &Bound<'_, PyAny>, allow_object: bool) -> PyResult<InitFn> {
500        let py = ob.py();
501        if ob.is_none() {
502            Ok(get_null)
503        }
504        // bool must be checked before int because Python bool is an instance of int.
505        else if ob.is_instance_of::<PyBool>() {
506            Ok(get_bool)
507        } else if ob.is_instance_of::<PyInt>() {
508            Ok(get_int)
509        } else if ob.is_instance_of::<PyFloat>() {
510            Ok(get_float)
511        } else if ob.is_instance_of::<PyString>() {
512            Ok(get_str)
513        } else if ob.is_instance_of::<PyBytes>() {
514            Ok(get_bytes)
515        } else if ob.is_instance_of::<PyList>() || ob.is_instance_of::<PyTuple>() {
516            Ok(get_list)
517        } else if ob.is_instance_of::<PyDict>() {
518            Ok(get_struct)
519        } else if PyMapping::type_check(ob) {
520            Ok(get_mapping)
521        }
522        // datetime must be checked before date because
523        // Python datetime is an instance of date.
524        else if PyDateTime::type_check(ob) {
525            Ok(get_datetime as InitFn)
526        } else if PyDate::type_check(ob) {
527            Ok(get_date as InitFn)
528        } else if PyTime::type_check(ob) {
529            Ok(get_time as InitFn)
530        } else if PyDelta::type_check(ob) {
531            Ok(get_timedelta as InitFn)
532        } else if ob.is_instance_of::<PyRange>() {
533            Ok(get_list as InitFn)
534        } else {
535            static DECIMAL_TYPE: GILOnceCell<Py<PyType>> = GILOnceCell::new();
536            if ob.is_instance(DECIMAL_TYPE.import(py, "decimal", "Decimal")?)? {
537                return Ok(get_decimal as InitFn);
538            }
539
540            // Support NumPy scalars.
541            if ob.extract::<i64>().is_ok() || ob.extract::<u64>().is_ok() {
542                return Ok(get_int as InitFn);
543            } else if ob.extract::<f64>().is_ok() {
544                return Ok(get_float as InitFn);
545            }
546
547            if allow_object {
548                Ok(get_object as InitFn)
549            } else {
550                Err(PyValueError::new_err(format!("Cannot convert {ob}")))
551            }
552        }
553    }
554
555    let py_type = ob.get_type();
556    let py_type_address = py_type.as_ptr() as usize;
557
558    let conversion_func = {
559        if let Some(cached_func) = LUT.lock().unwrap().get(&py_type_address) {
560            *cached_func
561        } else {
562            let k = TypeObjectKey::new(py_type.clone().unbind());
563            assert_eq!(k.address, py_type_address);
564
565            let func = get_conversion_function(ob, allow_object)?;
566            LUT.lock().unwrap().insert(k, func);
567            func
568        }
569    };
570
571    conversion_func(ob, strict)
572}