polars_python/conversion/
any_value.rs

1use std::borrow::{Borrow, Cow};
2use std::sync::{Arc, Mutex};
3
4use chrono::{
5    DateTime, Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike,
6};
7use chrono_tz::Tz;
8use hashbrown::HashMap;
9#[cfg(feature = "object")]
10use polars::chunked_array::object::PolarsObjectSafe;
11#[cfg(feature = "object")]
12use polars::datatypes::OwnedObject;
13use polars::datatypes::{DataType, Field, TimeUnit};
14use polars::prelude::{AnyValue, PlSmallStr, Series};
15use polars_core::utils::any_values_to_supertype_and_n_dtypes;
16use polars_core::utils::arrow::temporal_conversions::date32_to_date;
17use polars_utils::aliases::PlFixedStateQuality;
18use pyo3::exceptions::{PyOverflowError, PyTypeError, PyValueError};
19use pyo3::prelude::*;
20use pyo3::pybacked::PyBackedStr;
21use pyo3::types::{
22    PyBool, PyBytes, PyDict, PyFloat, PyInt, PyList, PySequence, PyString, PyTuple, PyType,
23};
24use pyo3::{IntoPyObjectExt, intern};
25
26use super::datetime::{
27    datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,
28};
29use super::{ObjectValue, Wrap, decimal_to_digits, struct_dict};
30use crate::error::PyPolarsErr;
31use crate::py_modules::{pl_series, pl_utils};
32use crate::series::PySeries;
33
34impl<'py> IntoPyObject<'py> for Wrap<AnyValue<'_>> {
35    type Target = PyAny;
36    type Output = Bound<'py, Self::Target>;
37    type Error = PyErr;
38
39    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
40        any_value_into_py_object(self.0, py)
41    }
42}
43
44impl<'py> IntoPyObject<'py> for &Wrap<AnyValue<'_>> {
45    type Target = PyAny;
46    type Output = Bound<'py, Self::Target>;
47    type Error = PyErr;
48
49    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
50        self.clone().into_pyobject(py)
51    }
52}
53
54impl<'py> FromPyObject<'py> for Wrap<AnyValue<'py>> {
55    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
56        py_object_to_any_value(ob, true, true).map(Wrap)
57    }
58}
59
60pub(crate) fn any_value_into_py_object<'py>(
61    av: AnyValue,
62    py: Python<'py>,
63) -> PyResult<Bound<'py, PyAny>> {
64    let utils = pl_utils(py).bind(py);
65    match av {
66        AnyValue::UInt8(v) => v.into_bound_py_any(py),
67        AnyValue::UInt16(v) => v.into_bound_py_any(py),
68        AnyValue::UInt32(v) => v.into_bound_py_any(py),
69        AnyValue::UInt64(v) => v.into_bound_py_any(py),
70        AnyValue::Int8(v) => v.into_bound_py_any(py),
71        AnyValue::Int16(v) => v.into_bound_py_any(py),
72        AnyValue::Int32(v) => v.into_bound_py_any(py),
73        AnyValue::Int64(v) => v.into_bound_py_any(py),
74        AnyValue::Int128(v) => v.into_bound_py_any(py),
75        AnyValue::Float32(v) => v.into_bound_py_any(py),
76        AnyValue::Float64(v) => v.into_bound_py_any(py),
77        AnyValue::Null => py.None().into_bound_py_any(py),
78        AnyValue::Boolean(v) => v.into_bound_py_any(py),
79        AnyValue::String(v) => v.into_bound_py_any(py),
80        AnyValue::StringOwned(v) => v.into_bound_py_any(py),
81        AnyValue::Categorical(idx, rev, arr) | AnyValue::Enum(idx, rev, arr) => {
82            let s = if arr.is_null() {
83                rev.get(idx)
84            } else {
85                unsafe { arr.deref_unchecked().value(idx as usize) }
86            };
87            s.into_bound_py_any(py)
88        },
89        AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
90            let s = if arr.is_null() {
91                rev.get(idx)
92            } else {
93                unsafe { arr.deref_unchecked().value(idx as usize) }
94            };
95            s.into_bound_py_any(py)
96        },
97        AnyValue::Date(v) => {
98            let date = date32_to_date(v);
99            date.into_bound_py_any(py)
100        },
101        AnyValue::Datetime(v, time_unit, time_zone) => {
102            datetime_to_py_object(py, v, time_unit, time_zone)
103        },
104        AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
105            datetime_to_py_object(py, v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
106        },
107        AnyValue::Duration(v, time_unit) => {
108            let time_delta = elapsed_offset_to_timedelta(v, time_unit);
109            time_delta.into_bound_py_any(py)
110        },
111        AnyValue::Time(v) => nanos_since_midnight_to_naivetime(v).into_bound_py_any(py),
112        AnyValue::Array(v, _) | AnyValue::List(v) => PySeries::new(v).to_list(py),
113        ref av @ AnyValue::Struct(_, _, flds) => {
114            Ok(struct_dict(py, av._iter_struct_av(), flds)?.into_any())
115        },
116        AnyValue::StructOwned(payload) => {
117            Ok(struct_dict(py, payload.0.into_iter(), &payload.1)?.into_any())
118        },
119        #[cfg(feature = "object")]
120        AnyValue::Object(v) => {
121            let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
122            Ok(object.inner.clone_ref(py).into_bound(py))
123        },
124        #[cfg(feature = "object")]
125        AnyValue::ObjectOwned(v) => {
126            let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
127            Ok(object.inner.clone_ref(py).into_bound(py))
128        },
129        AnyValue::Binary(v) => PyBytes::new(py, v).into_bound_py_any(py),
130        AnyValue::BinaryOwned(v) => PyBytes::new(py, &v).into_bound_py_any(py),
131        AnyValue::Decimal(v, scale) => {
132            let convert = utils.getattr(intern!(py, "to_py_decimal"))?;
133            const N: usize = 3;
134            let mut buf = [0_u128; N];
135            let n_digits = decimal_to_digits(v.abs(), &mut buf);
136            let buf = unsafe {
137                std::slice::from_raw_parts(
138                    buf.as_slice().as_ptr() as *const u8,
139                    N * size_of::<u128>(),
140                )
141            };
142            let digits = PyTuple::new(py, buf.iter().take(n_digits))?;
143            convert.call1((v.is_negative() as u8, digits, n_digits, -(scale as i32)))
144        },
145    }
146}
147
/// Holds a Python type object and implements hashing / equality based on the pointer address of the
/// type object. This is used as a hashtable key instead of only the `usize` pointer value, as we
/// need to hold a ref to the Python type object to keep it alive.
#[derive(Debug)]
pub struct TypeObjectKey {
    // Never read after construction; it is stored only to hold a reference to the type object,
    // which is why the `unused` lint is silenced here.
    #[allow(unused)]
    type_object: Py<PyType>,
    /// We need to store this in a field for `Borrow<usize>`
    address: usize,
}
158
159impl TypeObjectKey {
160    fn new(type_object: Py<PyType>) -> Self {
161        let address = type_object.as_ptr() as usize;
162        Self {
163            type_object,
164            address,
165        }
166    }
167}
168
169impl PartialEq for TypeObjectKey {
170    fn eq(&self, other: &Self) -> bool {
171        self.address == other.address
172    }
173}
174
// Equality compares the `usize` address only, so it is a full equivalence relation.
impl Eq for TypeObjectKey {}
176
177impl std::borrow::Borrow<usize> for TypeObjectKey {
178    fn borrow(&self) -> &usize {
179        &self.address
180    }
181}
182
183impl std::hash::Hash for TypeObjectKey {
184    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
185        let v: &usize = self.borrow();
186        v.hash(state)
187    }
188}
189
/// Signature shared by the converter functions: takes the Python object and the `strict` flag
/// and returns the converted [`AnyValue`].
type InitFn = for<'py> fn(&Bound<'py, PyAny>, bool) -> PyResult<AnyValue<'py>>;
/// Global, mutex-protected lookup table from a Python type object (see [`TypeObjectKey`]) to
/// the converter function used for it. The hasher is created with a fixed seed of `0`.
pub(crate) static LUT: Mutex<HashMap<TypeObjectKey, InitFn, PlFixedStateQuality>> =
    Mutex::new(HashMap::with_hasher(PlFixedStateQuality::with_seed(0)));
193
194/// Convert a Python object to an [`AnyValue`].
195pub(crate) fn py_object_to_any_value<'py>(
196    ob: &Bound<'py, PyAny>,
197    strict: bool,
198    allow_object: bool,
199) -> PyResult<AnyValue<'py>> {
200    // Conversion functions.
201    fn get_null(_ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
202        Ok(AnyValue::Null)
203    }
204
205    fn get_bool(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
206        let b = ob.extract::<bool>()?;
207        Ok(AnyValue::Boolean(b))
208    }
209
210    fn get_int(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
211        if let Ok(v) = ob.extract::<i64>() {
212            Ok(AnyValue::Int64(v))
213        } else if let Ok(v) = ob.extract::<i128>() {
214            Ok(AnyValue::Int128(v))
215        } else if !strict {
216            let f = ob.extract::<f64>()?;
217            Ok(AnyValue::Float64(f))
218        } else {
219            Err(PyOverflowError::new_err(format!(
220                "int value too large for Polars integer types: {ob}"
221            )))
222        }
223    }
224
225    fn get_float(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
226        Ok(AnyValue::Float64(ob.extract::<f64>()?))
227    }
228
229    fn get_str(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
230        // Ideally we'd be returning an AnyValue::String(&str) instead, as was
231        // the case in previous versions of this function. However, if compiling
232        // with abi3 for versions older than Python 3.10, the APIs that purport
233        // to return &str actually just encode to UTF-8 as a newly allocated
234        // PyBytes object, and then return reference to that. So what we're
235        // doing here isn't any different fundamentally, and the APIs to for
236        // converting to &str are deprecated in PyO3 0.21.
237        //
238        // Once Python 3.10 is the minimum supported version, converting to &str
239        // will be cheaper, and we should do that. Python 3.9 security updates
240        // end-of-life is Oct 31, 2025.
241        Ok(AnyValue::StringOwned(ob.extract::<String>()?.into()))
242    }
243
244    fn get_bytes<'py>(ob: &Bound<'py, PyAny>, _strict: bool) -> PyResult<AnyValue<'py>> {
245        let value = ob.extract::<Vec<u8>>()?;
246        Ok(AnyValue::BinaryOwned(value))
247    }
248
249    fn get_date(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
250        const UNIX_EPOCH: NaiveDate = DateTime::UNIX_EPOCH.naive_utc().date();
251        let date = ob.extract::<NaiveDate>()?;
252        let elapsed = date.signed_duration_since(UNIX_EPOCH);
253        Ok(AnyValue::Date(elapsed.num_days() as i32))
254    }
255
256    fn get_datetime(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
257        let py = ob.py();
258        let tzinfo = ob.getattr(intern!(py, "tzinfo"))?;
259
260        if tzinfo.is_none() {
261            let datetime = ob.extract::<NaiveDateTime>()?;
262            let delta = datetime - DateTime::UNIX_EPOCH.naive_utc();
263            let timestamp = delta.num_microseconds().unwrap();
264            return Ok(AnyValue::Datetime(timestamp, TimeUnit::Microseconds, None));
265        }
266
267        // Try converting `pytz` timezone to `zoneinfo` timezone
268        let (ob, tzinfo) = if let Some(tz) = tzinfo
269            .getattr(intern!(py, "zone"))
270            .ok()
271            .and_then(|zone| zone.extract::<PyBackedStr>().ok()?.parse::<Tz>().ok())
272        {
273            let tzinfo = tz.into_pyobject(py)?;
274            (
275                &ob.call_method(intern!(py, "astimezone"), (&tzinfo,), None)?,
276                tzinfo,
277            )
278        } else {
279            (ob, tzinfo)
280        };
281
282        let (timestamp, tz) = if tzinfo.hasattr(intern!(py, "key"))? {
283            let datetime = ob.extract::<DateTime<Tz>>()?;
284            let tz = datetime.timezone().name().into();
285            if datetime.year() >= 2100 {
286                // chrono-tz does not support dates after 2100
287                // https://github.com/chronotope/chrono-tz/issues/135
288                (
289                    pl_utils(py)
290                        .bind(py)
291                        .getattr(intern!(py, "datetime_to_int"))?
292                        .call1((ob, intern!(py, "us")))?
293                        .extract::<i64>()?,
294                    tz,
295                )
296            } else {
297                let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
298                (delta.num_microseconds().unwrap(), tz)
299            }
300        } else {
301            let datetime = ob.extract::<DateTime<FixedOffset>>()?;
302            let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
303            (delta.num_microseconds().unwrap(), "UTC".into())
304        };
305
306        Ok(AnyValue::DatetimeOwned(
307            timestamp,
308            TimeUnit::Microseconds,
309            Some(Arc::new(tz)),
310        ))
311    }
312
313    fn get_timedelta(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
314        let timedelta = ob.extract::<TimeDelta>()?;
315        if let Some(micros) = timedelta.num_microseconds() {
316            Ok(AnyValue::Duration(micros, TimeUnit::Microseconds))
317        } else {
318            Ok(AnyValue::Duration(
319                timedelta.num_milliseconds(),
320                TimeUnit::Milliseconds,
321            ))
322        }
323    }
324
325    fn get_time(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
326        let time = ob.extract::<NaiveTime>()?;
327
328        Ok(AnyValue::Time(
329            (time.num_seconds_from_midnight() as i64) * 1_000_000_000 + time.nanosecond() as i64,
330        ))
331    }
332
333    fn get_decimal(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
334        fn abs_decimal_from_digits(
335            digits: impl IntoIterator<Item = u8>,
336            exp: i32,
337        ) -> Option<(i128, usize)> {
338            const MAX_ABS_DEC: i128 = 10_i128.pow(38) - 1;
339            let mut v = 0_i128;
340            for (i, d) in digits.into_iter().map(i128::from).enumerate() {
341                if i < 38 {
342                    v = v * 10 + d;
343                } else {
344                    v = v.checked_mul(10).and_then(|v| v.checked_add(d))?;
345                }
346            }
347            // We only support non-negative scale (=> non-positive exponent).
348            let scale = if exp > 0 {
349                // The decimal may be in a non-canonical representation, try to fix it first.
350                v = 10_i128
351                    .checked_pow(exp as u32)
352                    .and_then(|factor| v.checked_mul(factor))?;
353                0
354            } else {
355                (-exp) as usize
356            };
357            // TODO: Do we care for checking if it fits in MAX_ABS_DEC? (if we set precision to None anyway?)
358            (v <= MAX_ABS_DEC).then_some((v, scale))
359        }
360
361        // Note: Using Vec<u8> is not the most efficient thing here (input is a tuple)
362        let (sign, digits, exp): (i8, Vec<u8>, i32) = ob
363            .call_method0(intern!(ob.py(), "as_tuple"))
364            .unwrap()
365            .extract()
366            .unwrap();
367        let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
368            PyErr::from(PyPolarsErr::Other(
369                "Decimal is too large to fit in Decimal128".into(),
370            ))
371        })?;
372        if sign > 0 {
373            v = -v; // Won't overflow since -i128::MAX > i128::MIN
374        }
375        Ok(AnyValue::Decimal(v, scale))
376    }
377
378    fn get_list(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {
379        fn get_list_with_constructor(
380            ob: &Bound<'_, PyAny>,
381            strict: bool,
382        ) -> PyResult<AnyValue<'static>> {
383            // Use the dedicated constructor.
384            // This constructor is able to go via dedicated type constructors
385            // so it can be much faster.
386            let py = ob.py();
387            let kwargs = PyDict::new(py);
388            kwargs.set_item("strict", strict)?;
389            let s = pl_series(py).call(py, (ob,), Some(&kwargs))?;
390            get_list_from_series(s.bind(py), strict)
391        }
392
393        if ob.is_empty()? {
394            Ok(AnyValue::List(Series::new_empty(
395                PlSmallStr::EMPTY,
396                &DataType::Null,
397            )))
398        } else if ob.is_instance_of::<PyList>() | ob.is_instance_of::<PyTuple>() {
399            const INFER_SCHEMA_LENGTH: usize = 25;
400
401            let list = ob.downcast::<PySequence>()?;
402
403            let mut avs = Vec::with_capacity(INFER_SCHEMA_LENGTH);
404            let mut iter = list.try_iter()?;
405            let mut items = Vec::with_capacity(INFER_SCHEMA_LENGTH);
406            for item in (&mut iter).take(INFER_SCHEMA_LENGTH) {
407                items.push(item?);
408                let av = py_object_to_any_value(items.last().unwrap(), strict, true)?;
409                avs.push(av)
410            }
411            let (dtype, n_dtypes) = any_values_to_supertype_and_n_dtypes(&avs)
412                .map_err(|e| PyTypeError::new_err(e.to_string()))?;
413
414            // This path is only taken if there is no question about the data type.
415            if dtype.is_primitive() && n_dtypes == 1 {
416                get_list_with_constructor(ob, strict)
417            } else {
418                // Push the rest.
419                let length = list.len()?;
420                avs.reserve(length);
421                let mut rest = Vec::with_capacity(length);
422                for item in iter {
423                    rest.push(item?);
424                    let av = py_object_to_any_value(rest.last().unwrap(), strict, true)?;
425                    avs.push(av)
426                }
427
428                let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, strict)
429                    .map_err(|e| {
430                        PyTypeError::new_err(format!(
431                            "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types."
432                        ))
433                    })?;
434                Ok(AnyValue::List(s))
435            }
436        } else {
437            // range will take this branch
438            get_list_with_constructor(ob, strict)
439        }
440    }
441
442    fn get_list_from_series(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
443        let s = super::get_series(ob)?;
444        Ok(AnyValue::List(s))
445    }
446
447    fn get_struct<'py>(ob: &Bound<'py, PyAny>, strict: bool) -> PyResult<AnyValue<'py>> {
448        let dict = ob.downcast::<PyDict>().unwrap();
449        let len = dict.len();
450        let mut keys = Vec::with_capacity(len);
451        let mut vals = Vec::with_capacity(len);
452        for (k, v) in dict.into_iter() {
453            let key = k.extract::<Cow<str>>()?;
454            let val = py_object_to_any_value(&v, strict, true)?;
455            let dtype = val.dtype();
456            keys.push(Field::new(key.as_ref().into(), dtype));
457            vals.push(val)
458        }
459        Ok(AnyValue::StructOwned(Box::new((vals, keys))))
460    }
461
462    fn get_object(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
463        #[cfg(feature = "object")]
464        {
465            // This is slow, but hey don't use objects.
466            let v = &ObjectValue {
467                inner: ob.clone().unbind(),
468            };
469            Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))
470        }
471        #[cfg(not(feature = "object"))]
472        panic!("activate object")
473    }
474
475    /// Determine which conversion function to use for the given object.
476    ///
477    /// Note: This function is only ran if the object's type is not already in the
478    /// lookup table.
479    fn get_conversion_function(ob: &Bound<'_, PyAny>, allow_object: bool) -> PyResult<InitFn> {
480        let py = ob.py();
481        if ob.is_none() {
482            Ok(get_null)
483        }
484        // bool must be checked before int because Python bool is an instance of int.
485        else if ob.is_instance_of::<PyBool>() {
486            Ok(get_bool)
487        } else if ob.is_instance_of::<PyInt>() {
488            Ok(get_int)
489        } else if ob.is_instance_of::<PyFloat>() {
490            Ok(get_float)
491        } else if ob.is_instance_of::<PyString>() {
492            Ok(get_str)
493        } else if ob.is_instance_of::<PyBytes>() {
494            Ok(get_bytes)
495        } else if ob.is_instance_of::<PyList>() || ob.is_instance_of::<PyTuple>() {
496            Ok(get_list)
497        } else if ob.is_instance_of::<PyDict>() {
498            Ok(get_struct)
499        } else {
500            let ob_type = ob.get_type();
501            let type_name = ob_type.fully_qualified_name()?.to_string();
502            match type_name.as_str() {
503                // Can't use pyo3::types::PyDateTime with abi3-py37 feature,
504                // so need this workaround instead of `isinstance(ob, datetime)`.
505                "datetime.date" => Ok(get_date as InitFn),
506                "datetime.time" => Ok(get_time as InitFn),
507                "datetime.datetime" => Ok(get_datetime as InitFn),
508                "datetime.timedelta" => Ok(get_timedelta as InitFn),
509                "decimal.Decimal" => Ok(get_decimal as InitFn),
510                "range" => Ok(get_list as InitFn),
511                _ => {
512                    // Support NumPy scalars.
513                    if ob.extract::<i64>().is_ok() || ob.extract::<u64>().is_ok() {
514                        return Ok(get_int as InitFn);
515                    } else if ob.extract::<f64>().is_ok() {
516                        return Ok(get_float as InitFn);
517                    }
518
519                    // Support custom subclasses of datetime/date.
520                    let ancestors = ob_type.getattr(intern!(py, "__mro__"))?;
521                    let ancestors_str_iter = ancestors
522                        .try_iter()?
523                        .map(|b| b.unwrap().str().unwrap().to_string());
524                    for c in ancestors_str_iter {
525                        match &*c {
526                            // datetime must be checked before date because
527                            // Python datetime is an instance of date.
528                            "<class 'datetime.datetime'>" => {
529                                return Ok(get_datetime as InitFn);
530                            },
531                            "<class 'datetime.date'>" => return Ok(get_date as InitFn),
532                            "<class 'datetime.timedelta'>" => return Ok(get_timedelta as InitFn),
533                            "<class 'datetime.time'>" => return Ok(get_time as InitFn),
534                            _ => (),
535                        }
536                    }
537
538                    if allow_object {
539                        Ok(get_object as InitFn)
540                    } else {
541                        Err(PyValueError::new_err(format!("Cannot convert {ob}")))
542                    }
543                },
544            }
545        }
546    }
547
548    let py_type = ob.get_type();
549    let py_type_address = py_type.as_ptr() as usize;
550
551    let conversion_func = {
552        if let Some(cached_func) = LUT.lock().unwrap().get(&py_type_address) {
553            *cached_func
554        } else {
555            let k = TypeObjectKey::new(py_type.clone().unbind());
556            assert_eq!(k.address, py_type_address);
557
558            let func = get_conversion_function(ob, allow_object)?;
559            LUT.lock().unwrap().insert(k, func);
560            func
561        }
562    };
563
564    conversion_func(ob, strict)
565}