polars_python/conversion/
any_value.rs

1use std::borrow::{Borrow, Cow};
2
3use chrono::{
4    DateTime, Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike,
5};
6use chrono_tz::Tz;
7#[cfg(feature = "object")]
8use polars::chunked_array::object::PolarsObjectSafe;
9#[cfg(feature = "object")]
10use polars::datatypes::OwnedObject;
11use polars::datatypes::{DataType, Field, PlHashMap, TimeUnit};
12use polars::prelude::{AnyValue, PlSmallStr, Series};
13use polars_core::utils::any_values_to_supertype_and_n_dtypes;
14use polars_core::utils::arrow::temporal_conversions::date32_to_date;
15use pyo3::exceptions::{PyOverflowError, PyTypeError, PyValueError};
16use pyo3::prelude::*;
17use pyo3::types::{
18    PyBool, PyBytes, PyDict, PyFloat, PyInt, PyList, PySequence, PyString, PyTuple, PyType,
19};
20use pyo3::{intern, IntoPyObjectExt};
21
22use super::datetime::{
23    datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,
24};
25use super::{decimal_to_digits, struct_dict, ObjectValue, Wrap};
26use crate::error::PyPolarsErr;
27use crate::py_modules::{pl_series, pl_utils};
28use crate::series::PySeries;
29
30impl<'py> IntoPyObject<'py> for Wrap<AnyValue<'_>> {
31    type Target = PyAny;
32    type Output = Bound<'py, Self::Target>;
33    type Error = PyErr;
34
35    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
36        any_value_into_py_object(self.0, py)
37    }
38}
39
40impl<'py> IntoPyObject<'py> for &Wrap<AnyValue<'_>> {
41    type Target = PyAny;
42    type Output = Bound<'py, Self::Target>;
43    type Error = PyErr;
44
45    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
46        self.clone().into_pyobject(py)
47    }
48}
49
50impl<'py> FromPyObject<'py> for Wrap<AnyValue<'py>> {
51    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
52        py_object_to_any_value(ob, true, true).map(Wrap)
53    }
54}
55
56pub(crate) fn any_value_into_py_object<'py>(
57    av: AnyValue,
58    py: Python<'py>,
59) -> PyResult<Bound<'py, PyAny>> {
60    let utils = pl_utils(py).bind(py);
61    match av {
62        AnyValue::UInt8(v) => v.into_bound_py_any(py),
63        AnyValue::UInt16(v) => v.into_bound_py_any(py),
64        AnyValue::UInt32(v) => v.into_bound_py_any(py),
65        AnyValue::UInt64(v) => v.into_bound_py_any(py),
66        AnyValue::Int8(v) => v.into_bound_py_any(py),
67        AnyValue::Int16(v) => v.into_bound_py_any(py),
68        AnyValue::Int32(v) => v.into_bound_py_any(py),
69        AnyValue::Int64(v) => v.into_bound_py_any(py),
70        AnyValue::Int128(v) => v.into_bound_py_any(py),
71        AnyValue::Float32(v) => v.into_bound_py_any(py),
72        AnyValue::Float64(v) => v.into_bound_py_any(py),
73        AnyValue::Null => py.None().into_bound_py_any(py),
74        AnyValue::Boolean(v) => v.into_bound_py_any(py),
75        AnyValue::String(v) => v.into_bound_py_any(py),
76        AnyValue::StringOwned(v) => v.into_bound_py_any(py),
77        AnyValue::Categorical(idx, rev, arr) | AnyValue::Enum(idx, rev, arr) => {
78            let s = if arr.is_null() {
79                rev.get(idx)
80            } else {
81                unsafe { arr.deref_unchecked().value(idx as usize) }
82            };
83            s.into_bound_py_any(py)
84        },
85        AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
86            let s = if arr.is_null() {
87                rev.get(idx)
88            } else {
89                unsafe { arr.deref_unchecked().value(idx as usize) }
90            };
91            s.into_bound_py_any(py)
92        },
93        AnyValue::Date(v) => {
94            let date = date32_to_date(v);
95            date.into_bound_py_any(py)
96        },
97        AnyValue::Datetime(v, time_unit, time_zone) => {
98            datetime_to_py_object(py, v, time_unit, time_zone)
99        },
100        AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
101            datetime_to_py_object(py, v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
102        },
103        AnyValue::Duration(v, time_unit) => {
104            let time_delta = elapsed_offset_to_timedelta(v, time_unit);
105            time_delta.into_bound_py_any(py)
106        },
107        AnyValue::Time(v) => nanos_since_midnight_to_naivetime(v).into_bound_py_any(py),
108        AnyValue::Array(v, _) | AnyValue::List(v) => PySeries::new(v).to_list(py),
109        ref av @ AnyValue::Struct(_, _, flds) => {
110            Ok(struct_dict(py, av._iter_struct_av(), flds)?.into_any())
111        },
112        AnyValue::StructOwned(payload) => {
113            Ok(struct_dict(py, payload.0.into_iter(), &payload.1)?.into_any())
114        },
115        #[cfg(feature = "object")]
116        AnyValue::Object(v) => {
117            let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
118            Ok(object.inner.clone_ref(py).into_bound(py))
119        },
120        #[cfg(feature = "object")]
121        AnyValue::ObjectOwned(v) => {
122            let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
123            Ok(object.inner.clone_ref(py).into_bound(py))
124        },
125        AnyValue::Binary(v) => PyBytes::new(py, v).into_bound_py_any(py),
126        AnyValue::BinaryOwned(v) => PyBytes::new(py, &v).into_bound_py_any(py),
127        AnyValue::Decimal(v, scale) => {
128            let convert = utils.getattr(intern!(py, "to_py_decimal"))?;
129            const N: usize = 3;
130            let mut buf = [0_u128; N];
131            let n_digits = decimal_to_digits(v.abs(), &mut buf);
132            let buf = unsafe {
133                std::slice::from_raw_parts(
134                    buf.as_slice().as_ptr() as *const u8,
135                    N * size_of::<u128>(),
136                )
137            };
138            let digits = PyTuple::new(py, buf.iter().take(n_digits))?;
139            convert.call1((v.is_negative() as u8, digits, n_digits, -(scale as i32)))
140        },
141    }
142}
143
/// Holds a Python type object and implements hashing / equality based on the pointer address of the
/// type object. This is used as a hashtable key instead of only the `usize` pointer value, as we
/// need to hold a ref to the Python type object to keep it alive.
///
/// Lookups can be done with a bare `usize` address thanks to the `Borrow<usize>` implementation.
#[derive(Debug)]
pub struct TypeObjectKey {
    // Never read after construction: the field exists only to own a reference to the
    // type object, hence the `unused` allowance.
    #[allow(unused)]
    type_object: Py<PyType>,
    /// We need to store this in a field for `Borrow<usize>`
    /// (`borrow` has to return a reference, so the address must live inside the key).
    address: usize,
}
154
155impl TypeObjectKey {
156    fn new(type_object: Py<PyType>) -> Self {
157        let address = type_object.as_ptr() as usize;
158        Self {
159            type_object,
160            address,
161        }
162    }
163}
164
165impl PartialEq for TypeObjectKey {
166    fn eq(&self, other: &Self) -> bool {
167        self.address == other.address
168    }
169}
170
// Equality compares plain `usize` addresses, so it is a full equivalence relation.
impl Eq for TypeObjectKey {}
172
173impl std::borrow::Borrow<usize> for TypeObjectKey {
174    fn borrow(&self) -> &usize {
175        &self.address
176    }
177}
178
179impl std::hash::Hash for TypeObjectKey {
180    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
181        let v: &usize = self.borrow();
182        v.hash(state)
183    }
184}
185
/// Signature shared by all Python-object -> [`AnyValue`] conversion routines:
/// `(object, strict) -> AnyValue`.
type InitFn = for<'py> fn(&Bound<'py, PyAny>, bool) -> PyResult<AnyValue<'py>>;
/// Cache mapping a Python type object (by address) to the conversion routine
/// selected for it, so the type inspection is only done once per type.
pub(crate) static LUT: crate::gil_once_cell::GILOnceCell<PlHashMap<TypeObjectKey, InitFn>> =
    crate::gil_once_cell::GILOnceCell::new();
189
190/// Convert a Python object to an [`AnyValue`].
191pub(crate) fn py_object_to_any_value<'py>(
192    ob: &Bound<'py, PyAny>,
193    strict: bool,
194    allow_object: bool,
195) -> PyResult<AnyValue<'py>> {
196    // Conversion functions.
197    fn get_null(_ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
198        Ok(AnyValue::Null)
199    }
200
201    fn get_bool(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
202        let b = ob.extract::<bool>()?;
203        Ok(AnyValue::Boolean(b))
204    }
205
206    fn get_int(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
207        if let Ok(v) = ob.extract::<i64>() {
208            Ok(AnyValue::Int64(v))
209        } else if let Ok(v) = ob.extract::<i128>() {
210            Ok(AnyValue::Int128(v))
211        } else if !strict {
212            let f = ob.extract::<f64>()?;
213            Ok(AnyValue::Float64(f))
214        } else {
215            Err(PyOverflowError::new_err(format!(
216                "int value too large for Polars integer types: {ob}"
217            )))
218        }
219    }
220
221    fn get_float(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
222        Ok(AnyValue::Float64(ob.extract::<f64>()?))
223    }
224
225    fn get_str(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
226        // Ideally we'd be returning an AnyValue::String(&str) instead, as was
227        // the case in previous versions of this function. However, if compiling
228        // with abi3 for versions older than Python 3.10, the APIs that purport
229        // to return &str actually just encode to UTF-8 as a newly allocated
230        // PyBytes object, and then return reference to that. So what we're
231        // doing here isn't any different fundamentally, and the APIs to for
232        // converting to &str are deprecated in PyO3 0.21.
233        //
234        // Once Python 3.10 is the minimum supported version, converting to &str
235        // will be cheaper, and we should do that. Python 3.9 security updates
236        // end-of-life is Oct 31, 2025.
237        Ok(AnyValue::StringOwned(ob.extract::<String>()?.into()))
238    }
239
240    fn get_bytes<'py>(ob: &Bound<'py, PyAny>, _strict: bool) -> PyResult<AnyValue<'py>> {
241        let value = ob.extract::<Vec<u8>>()?;
242        Ok(AnyValue::BinaryOwned(value))
243    }
244
245    fn get_date(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
246        const UNIX_EPOCH: NaiveDate = NaiveDateTime::UNIX_EPOCH.date();
247        let date = ob.extract::<NaiveDate>()?;
248        let elapsed = date.signed_duration_since(UNIX_EPOCH);
249        Ok(AnyValue::Date(elapsed.num_days() as i32))
250    }
251
252    fn get_datetime(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
253        let py = ob.py();
254        let tzinfo = ob.getattr(intern!(py, "tzinfo"))?;
255
256        let timestamp = if tzinfo.is_none() {
257            let datetime = ob.extract::<NaiveDateTime>()?;
258            let delta = datetime - NaiveDateTime::UNIX_EPOCH;
259            delta.num_microseconds().unwrap()
260        } else if tzinfo.hasattr(intern!(py, "key"))? {
261            let datetime = ob.extract::<DateTime<Tz>>()?;
262            if datetime.year() >= 2100 {
263                // chrono-tz does not support dates after 2100
264                // https://github.com/chronotope/chrono-tz/issues/135
265                pl_utils(py)
266                    .bind(py)
267                    .getattr(intern!(py, "datetime_to_int"))?
268                    .call1((ob, intern!(py, "us")))?
269                    .extract::<i64>()?
270            } else {
271                let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
272                delta.num_microseconds().unwrap()
273            }
274        } else {
275            let datetime = ob.extract::<DateTime<FixedOffset>>()?;
276            let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
277            delta.num_microseconds().unwrap()
278        };
279
280        Ok(AnyValue::Datetime(timestamp, TimeUnit::Microseconds, None))
281    }
282
283    fn get_timedelta(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
284        let timedelta = ob.extract::<TimeDelta>()?;
285        if let Some(micros) = timedelta.num_microseconds() {
286            Ok(AnyValue::Duration(micros, TimeUnit::Microseconds))
287        } else {
288            Ok(AnyValue::Duration(
289                timedelta.num_milliseconds(),
290                TimeUnit::Milliseconds,
291            ))
292        }
293    }
294
295    fn get_time(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
296        let time = ob.extract::<NaiveTime>()?;
297
298        Ok(AnyValue::Time(
299            (time.num_seconds_from_midnight() as i64) * 1_000_000_000 + time.nanosecond() as i64,
300        ))
301    }
302
303    fn get_decimal(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
304        fn abs_decimal_from_digits(
305            digits: impl IntoIterator<Item = u8>,
306            exp: i32,
307        ) -> Option<(i128, usize)> {
308            const MAX_ABS_DEC: i128 = 10_i128.pow(38) - 1;
309            let mut v = 0_i128;
310            for (i, d) in digits.into_iter().map(i128::from).enumerate() {
311                if i < 38 {
312                    v = v * 10 + d;
313                } else {
314                    v = v.checked_mul(10).and_then(|v| v.checked_add(d))?;
315                }
316            }
317            // We only support non-negative scale (=> non-positive exponent).
318            let scale = if exp > 0 {
319                // The decimal may be in a non-canonical representation, try to fix it first.
320                v = 10_i128
321                    .checked_pow(exp as u32)
322                    .and_then(|factor| v.checked_mul(factor))?;
323                0
324            } else {
325                (-exp) as usize
326            };
327            // TODO: Do we care for checking if it fits in MAX_ABS_DEC? (if we set precision to None anyway?)
328            (v <= MAX_ABS_DEC).then_some((v, scale))
329        }
330
331        // Note: Using Vec<u8> is not the most efficient thing here (input is a tuple)
332        let (sign, digits, exp): (i8, Vec<u8>, i32) = ob
333            .call_method0(intern!(ob.py(), "as_tuple"))
334            .unwrap()
335            .extract()
336            .unwrap();
337        let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
338            PyErr::from(PyPolarsErr::Other(
339                "Decimal is too large to fit in Decimal128".into(),
340            ))
341        })?;
342        if sign > 0 {
343            v = -v; // Won't overflow since -i128::MAX > i128::MIN
344        }
345        Ok(AnyValue::Decimal(v, scale))
346    }
347
348    fn get_list(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
349        fn get_list_with_constructor(
350            ob: &Bound<'_, PyAny>,
351            strict: bool,
352        ) -> PyResult<AnyValue<'static>> {
353            // Use the dedicated constructor.
354            // This constructor is able to go via dedicated type constructors
355            // so it can be much faster.
356            let py = ob.py();
357            let kwargs = PyDict::new(py);
358            kwargs.set_item("strict", strict)?;
359            let s = pl_series(py).call(py, (ob,), Some(&kwargs))?;
360            get_list_from_series(s.bind(py), strict)
361        }
362
363        if ob.is_empty()? {
364            Ok(AnyValue::List(Series::new_empty(
365                PlSmallStr::EMPTY,
366                &DataType::Null,
367            )))
368        } else if ob.is_instance_of::<PyList>() | ob.is_instance_of::<PyTuple>() {
369            const INFER_SCHEMA_LENGTH: usize = 25;
370
371            let list = ob.downcast::<PySequence>()?;
372
373            let mut avs = Vec::with_capacity(INFER_SCHEMA_LENGTH);
374            let mut iter = list.try_iter()?;
375            let mut items = Vec::with_capacity(INFER_SCHEMA_LENGTH);
376            for item in (&mut iter).take(INFER_SCHEMA_LENGTH) {
377                items.push(item?);
378                let av = py_object_to_any_value(items.last().unwrap(), strict, true)?;
379                avs.push(av)
380            }
381            let (dtype, n_dtypes) = any_values_to_supertype_and_n_dtypes(&avs)
382                .map_err(|e| PyTypeError::new_err(e.to_string()))?;
383
384            // This path is only taken if there is no question about the data type.
385            if dtype.is_primitive() && n_dtypes == 1 {
386                get_list_with_constructor(ob, strict)
387            } else {
388                // Push the rest.
389                let length = list.len()?;
390                avs.reserve(length);
391                let mut rest = Vec::with_capacity(length);
392                for item in iter {
393                    rest.push(item?);
394                    let av = py_object_to_any_value(rest.last().unwrap(), strict, true)?;
395                    avs.push(av)
396                }
397
398                let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, strict)
399                    .map_err(|e| {
400                        PyTypeError::new_err(format!(
401                            "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types."
402                        ))
403                    })?;
404                Ok(AnyValue::List(s))
405            }
406        } else {
407            // range will take this branch
408            get_list_with_constructor(ob, strict)
409        }
410    }
411
412    fn get_list_from_series(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
413        let s = super::get_series(ob)?;
414        Ok(AnyValue::List(s))
415    }
416
417    fn get_struct<'py>(ob: &Bound<'py, PyAny>, strict: bool) -> PyResult<AnyValue<'py>> {
418        let dict = ob.downcast::<PyDict>().unwrap();
419        let len = dict.len();
420        let mut keys = Vec::with_capacity(len);
421        let mut vals = Vec::with_capacity(len);
422        for (k, v) in dict.into_iter() {
423            let key = k.extract::<Cow<str>>()?;
424            let val = py_object_to_any_value(&v, strict, true)?;
425            let dtype = val.dtype();
426            keys.push(Field::new(key.as_ref().into(), dtype));
427            vals.push(val)
428        }
429        Ok(AnyValue::StructOwned(Box::new((vals, keys))))
430    }
431
432    fn get_object(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
433        #[cfg(feature = "object")]
434        {
435            // This is slow, but hey don't use objects.
436            let v = &ObjectValue {
437                inner: ob.clone().unbind(),
438            };
439            Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))
440        }
441        #[cfg(not(feature = "object"))]
442        panic!("activate object")
443    }
444
445    /// Determine which conversion function to use for the given object.
446    ///
447    /// Note: This function is only ran if the object's type is not already in the
448    /// lookup table.
449    fn get_conversion_function(
450        ob: &Bound<'_, PyAny>,
451        py: Python<'_>,
452        allow_object: bool,
453    ) -> PyResult<InitFn> {
454        if ob.is_none() {
455            Ok(get_null)
456        }
457        // bool must be checked before int because Python bool is an instance of int.
458        else if ob.is_instance_of::<PyBool>() {
459            Ok(get_bool)
460        } else if ob.is_instance_of::<PyInt>() {
461            Ok(get_int)
462        } else if ob.is_instance_of::<PyFloat>() {
463            Ok(get_float)
464        } else if ob.is_instance_of::<PyString>() {
465            Ok(get_str)
466        } else if ob.is_instance_of::<PyBytes>() {
467            Ok(get_bytes)
468        } else if ob.is_instance_of::<PyList>() || ob.is_instance_of::<PyTuple>() {
469            Ok(get_list)
470        } else if ob.is_instance_of::<PyDict>() {
471            Ok(get_struct)
472        } else {
473            let ob_type = ob.get_type();
474            let type_name = ob_type.fully_qualified_name()?.to_string();
475            match type_name.as_str() {
476                // Can't use pyo3::types::PyDateTime with abi3-py37 feature,
477                // so need this workaround instead of `isinstance(ob, datetime)`.
478                "datetime.date" => Ok(get_date as InitFn),
479                "datetime.time" => Ok(get_time as InitFn),
480                "datetime.datetime" => Ok(get_datetime as InitFn),
481                "datetime.timedelta" => Ok(get_timedelta as InitFn),
482                "decimal.Decimal" => Ok(get_decimal as InitFn),
483                "range" => Ok(get_list as InitFn),
484                _ => {
485                    // Support NumPy scalars.
486                    if ob.extract::<i64>().is_ok() || ob.extract::<u64>().is_ok() {
487                        return Ok(get_int as InitFn);
488                    } else if ob.extract::<f64>().is_ok() {
489                        return Ok(get_float as InitFn);
490                    }
491
492                    // Support custom subclasses of datetime/date.
493                    let ancestors = ob_type.getattr(intern!(py, "__mro__"))?;
494                    let ancestors_str_iter = ancestors
495                        .try_iter()?
496                        .map(|b| b.unwrap().str().unwrap().to_string());
497                    for c in ancestors_str_iter {
498                        match &*c {
499                            // datetime must be checked before date because
500                            // Python datetime is an instance of date.
501                            "<class 'datetime.datetime'>" => {
502                                return Ok(get_datetime as InitFn);
503                            },
504                            "<class 'datetime.date'>" => return Ok(get_date as InitFn),
505                            "<class 'datetime.timedelta'>" => return Ok(get_timedelta as InitFn),
506                            "<class 'datetime.time'>" => return Ok(get_time as InitFn),
507                            _ => (),
508                        }
509                    }
510
511                    if allow_object {
512                        Ok(get_object as InitFn)
513                    } else {
514                        Err(PyValueError::new_err(format!("Cannot convert {ob}")))
515                    }
516                },
517            }
518        }
519    }
520
521    let py_type = ob.get_type();
522    let py_type_address = py_type.as_ptr() as usize;
523
524    Python::with_gil(move |py| {
525        LUT.with_gil(py, move |lut| {
526            if !lut.contains_key(&py_type_address) {
527                let k = TypeObjectKey::new(py_type.clone().unbind());
528
529                assert_eq!(k.address, py_type_address);
530
531                unsafe {
532                    lut.insert_unique_unchecked(k, get_conversion_function(ob, py, allow_object)?);
533                }
534            }
535
536            let conversion_func = lut.get(&py_type_address).unwrap();
537            conversion_func(ob, strict)
538        })
539    })
540}