polars_python/series/
buffers.rs

1//! Construct and deconstruct Series based on the underlying buffers.
2//!
3//! This functionality is mainly intended for use with the Python dataframe
4//! interchange protocol.
5//!
6//! As Polars has no Buffer concept in Python, each buffer is represented as
7//! a Series of its physical type.
8//!
9//! Note that String Series have underlying `Utf8View` buffers, which
10//! currently cannot be represented as Series. Since the interchange protocol
11//! cannot handle these buffers anyway and expects bytes and offsets buffers,
12//! operations on String Series will convert from/to such buffers. This
13//! conversion requires data to be copied.
14
15use arrow::array::{Array, BooleanArray, PrimitiveArray, Utf8Array};
16use arrow::bitmap::Bitmap;
17use arrow::buffer::Buffer;
18use arrow::offset::OffsetsBuffer;
19use arrow::types::NativeType;
20use polars::prelude::*;
21use polars_core::{with_match_physical_numeric_polars_type, with_match_physical_numeric_type};
22use pyo3::exceptions::PyTypeError;
23use pyo3::prelude::*;
24use pyo3::types::PyTuple;
25
26use super::{PySeries, ToSeries};
27use crate::conversion::Wrap;
28use crate::error::PyPolarsErr;
29use crate::raise_err;
30use crate::utils::EnterPolarsExt;
31
32struct BufferInfo {
33    pointer: usize,
34    offset: usize,
35    length: usize,
36}
37impl<'py> IntoPyObject<'py> for BufferInfo {
38    type Target = PyTuple;
39    type Output = Bound<'py, Self::Target>;
40    type Error = PyErr;
41
42    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
43        (self.pointer, self.offset, self.length).into_pyobject(py)
44    }
45}
46impl<'py> FromPyObject<'py> for BufferInfo {
47    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
48        let (pointer, offset, length) = ob.extract()?;
49        Ok(Self {
50            pointer,
51            offset,
52            length,
53        })
54    }
55}
56
57#[pymethods]
58impl PySeries {
59    /// Return pointer, offset, and length information about the underlying buffer.
60    fn _get_buffer_info(&self) -> PyResult<BufferInfo> {
61        let s = self.series.to_physical_repr();
62        let arrays = s.chunks();
63        if arrays.len() != 1 {
64            let msg = "cannot get buffer info for Series consisting of multiple chunks";
65            raise_err!(msg, ComputeError);
66        }
67        match s.dtype() {
68            DataType::Boolean => {
69                let ca = s.bool().unwrap();
70                let arr = ca.downcast_iter().next().unwrap();
71                let (slice, offset, len) = arr.values().as_slice();
72                Ok(BufferInfo {
73                    pointer: slice.as_ptr() as usize,
74                    offset,
75                    length: len,
76                })
77            },
78            dt if dt.is_primitive_numeric() => {
79                Ok(with_match_physical_numeric_polars_type!(dt, |$T| {
80                    let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
81                    BufferInfo { pointer: get_pointer(ca), offset: 0, length: ca.len() }
82                }))
83            },
84            dt => {
85                let msg = format!(
86                    "`_get_buffer_info` not implemented for non-physical type {dt}; try to select a buffer first"
87                );
88                Err(PyTypeError::new_err(msg))
89            },
90        }
91    }
92
93    /// Return the underlying values, validity, and offsets buffers as Series.
94    fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option<Self>, Option<Self>)> {
95        let s = &self.series;
96        py.enter_polars(|| match s.dtype().to_physical() {
97            dt if dt.is_primitive_numeric() => get_buffers_from_primitive(s),
98            DataType::Boolean => get_buffers_from_primitive(s),
99            DataType::String => get_buffers_from_string(s),
100            dt => {
101                let msg = format!("`_get_buffers` not implemented for `dtype` {dt}");
102                Err(PyTypeError::new_err(msg))
103            },
104        })
105    }
106}
107
108fn get_pointer<T: PolarsNumericType>(ca: &ChunkedArray<T>) -> usize {
109    let arr = ca.downcast_iter().next().unwrap();
110    arr.values().as_ptr() as usize
111}
112
113fn get_buffers_from_primitive(
114    s: &Series,
115) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {
116    let chunks = s
117        .chunks()
118        .iter()
119        .map(|arr| arr.with_validity(None))
120        .collect::<Vec<_>>();
121    let values = Series::try_from((s.name().clone(), chunks))
122        .map_err(PyPolarsErr::from)?
123        .into();
124
125    let validity = get_bitmap(s);
126    let offsets = None;
127    Ok((values, validity, offsets))
128}
129
130/// The underlying buffers for `String` Series cannot be represented in this
131/// format. Instead, the buffers are converted to a values and offsets buffer.
132/// This copies data.
133fn get_buffers_from_string(s: &Series) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {
134    // We cannot do this zero copy anyway, so rechunk first
135    let s = s.rechunk();
136
137    let ca = s.str().map_err(PyPolarsErr::from)?;
138    let arr_binview = ca.downcast_iter().next().unwrap();
139
140    // This is not zero-copy
141    let arr_utf8 = polars_compute::cast::utf8view_to_utf8(arr_binview);
142
143    let values = get_string_bytes(&arr_utf8)?;
144    let validity = get_bitmap(&s);
145    let offsets = get_string_offsets(&arr_utf8)?;
146
147    Ok((values, validity, Some(offsets)))
148}
149
150fn get_bitmap(s: &Series) -> Option<PySeries> {
151    if s.null_count() > 0 {
152        Some(s.is_not_null().into_series().into())
153    } else {
154        None
155    }
156}
157
158fn get_string_bytes(arr: &Utf8Array<i64>) -> PyResult<PySeries> {
159    let values_buffer = arr.values();
160    let values_arr =
161        PrimitiveArray::<u8>::try_new(ArrowDataType::UInt8, values_buffer.clone(), None)
162            .map_err(PyPolarsErr::from)?;
163    let values = Series::from_arrow(PlSmallStr::EMPTY, values_arr.to_boxed())
164        .map_err(PyPolarsErr::from)?
165        .into();
166    Ok(values)
167}
168
169fn get_string_offsets(arr: &Utf8Array<i64>) -> PyResult<PySeries> {
170    let offsets_buffer = arr.offsets().buffer();
171    let offsets_arr =
172        PrimitiveArray::<i64>::try_new(ArrowDataType::Int64, offsets_buffer.clone(), None)
173            .map_err(PyPolarsErr::from)?;
174    let offsets = Series::from_arrow(PlSmallStr::EMPTY, offsets_arr.to_boxed())
175        .map_err(PyPolarsErr::from)?
176        .into();
177    Ok(offsets)
178}
179
180#[pymethods]
181impl PySeries {
182    /// Construct a PySeries from information about its underlying buffer.
183    #[staticmethod]
184    unsafe fn _from_buffer(
185        dtype: Wrap<DataType>,
186        buffer_info: BufferInfo,
187        owner: &Bound<'_, PyAny>,
188    ) -> PyResult<Self> {
189        let dtype = dtype.0;
190        let BufferInfo {
191            pointer,
192            offset,
193            length,
194        } = buffer_info;
195        let owner = owner.to_owned().unbind();
196
197        let arr_boxed = match dtype {
198            dt if dt.is_primitive_numeric() => {
199                with_match_physical_numeric_type!(dt, |$T|  unsafe {
200                    from_buffer_impl::<$T>(pointer, offset, length, owner)
201                })
202            },
203            DataType::Boolean => {
204                unsafe { from_buffer_boolean_impl(pointer, offset, length, owner) }?
205            },
206            dt => {
207                let msg = format!(
208                    "`_from_buffer` requires a physical type as input for `dtype`, got {dt}"
209                );
210                return Err(PyTypeError::new_err(msg));
211            },
212        };
213
214        let s = Series::from_arrow(PlSmallStr::EMPTY, arr_boxed)
215            .unwrap()
216            .into();
217        Ok(s)
218    }
219}
220
221unsafe fn from_buffer_impl<T: NativeType>(
222    pointer: usize,
223    offset: usize,
224    length: usize,
225    owner: Py<PyAny>,
226) -> Box<dyn Array> {
227    let pointer = pointer as *const T;
228    let pointer = unsafe { pointer.add(offset) };
229    let slice = unsafe { std::slice::from_raw_parts(pointer, length) };
230    let arr = unsafe { arrow::ffi::mmap::slice_and_owner(slice, owner) };
231    arr.to_boxed()
232}
233unsafe fn from_buffer_boolean_impl(
234    pointer: usize,
235    offset: usize,
236    length: usize,
237    owner: Py<PyAny>,
238) -> PyResult<Box<dyn Array>> {
239    let length_in_bytes = get_boolean_buffer_length_in_bytes(length, offset);
240
241    let pointer = pointer as *const u8;
242    let slice = unsafe { std::slice::from_raw_parts(pointer, length_in_bytes) };
243    let arr_result = unsafe { arrow::ffi::mmap::bitmap_and_owner(slice, offset, length, owner) };
244    let arr = arr_result.map_err(PyPolarsErr::from)?;
245    Ok(arr.to_boxed())
246}
/// Number of bytes needed to hold `offset + length` bits, rounded up to a
/// whole byte.
fn get_boolean_buffer_length_in_bytes(length: usize, offset: usize) -> usize {
    (offset + length).div_ceil(8)
}
253
254#[pymethods]
255impl PySeries {
256    /// Construct a PySeries from information about its underlying buffers.
257    #[staticmethod]
258    #[pyo3(signature = (dtype, data, validity=None))]
259    unsafe fn _from_buffers(
260        py: Python<'_>,
261        dtype: Wrap<DataType>,
262        data: Vec<PySeries>,
263        validity: Option<PySeries>,
264    ) -> PyResult<Self> {
265        let dtype = dtype.0;
266        let mut data = data.to_series();
267
268        match data.len() {
269            0 => {
270                let msg = "`data` input to `_from_buffers` must contain at least one buffer";
271                return Err(PyTypeError::new_err(msg));
272            },
273            1 if validity.is_none() => {
274                let values = data.pop().unwrap();
275                let s = values.strict_cast(&dtype).map_err(PyPolarsErr::from)?;
276                return Ok(s.into());
277            },
278            _ => (),
279        }
280
281        let validity = match validity {
282            Some(s) => {
283                let dtype = s.series.dtype();
284                if !dtype.is_bool() {
285                    let msg = format!("validity buffer must have data type Boolean, got {dtype:?}");
286                    return Err(PyTypeError::new_err(msg));
287                }
288                Some(series_to_bitmap(s.series).unwrap())
289            },
290            None => None,
291        };
292
293        let s = match dtype.to_physical() {
294            dt if dt.is_primitive_numeric() => {
295                let values = data.into_iter().next().unwrap();
296                with_match_physical_numeric_polars_type!(dt, |$T| {
297                    let values_buffer = series_to_buffer::<$T>(values);
298                    from_buffers_num_impl::<<$T as PolarsNumericType>::Native>(values_buffer, validity)?
299                })
300            },
301            DataType::Boolean => {
302                let values = data.into_iter().next().unwrap();
303                let values_buffer = series_to_bitmap(values)?;
304                from_buffers_bool_impl(values_buffer, validity)?
305            },
306            DataType::String => {
307                let mut data_iter = data.into_iter();
308                let values = data_iter.next().unwrap();
309                let offsets = match data_iter.next() {
310                    Some(s) => {
311                        let dtype = s.dtype();
312                        if !matches!(dtype, DataType::Int64) {
313                            return Err(PyTypeError::new_err(format!(
314                                "offsets buffer must have data type Int64, got {dtype:?}"
315                            )));
316                        }
317                        series_to_offsets(s)
318                    },
319                    None => {
320                        return Err(PyTypeError::new_err(
321                            "`_from_buffers` cannot create a String column without an offsets buffer",
322                        ));
323                    },
324                };
325                let values = series_to_buffer::<UInt8Type>(values);
326                py.enter_polars(|| from_buffers_string_impl(values, validity, offsets))?
327            },
328            dt => {
329                let msg = format!("`_from_buffers` not implemented for `dtype` {dt}");
330                return Err(PyTypeError::new_err(msg));
331            },
332        };
333
334        let out = s.strict_cast(&dtype).map_err(PyPolarsErr::from)?;
335        Ok(out.into())
336    }
337}
338
339fn series_to_buffer<T>(s: Series) -> Buffer<T::Native>
340where
341    T: PolarsNumericType,
342{
343    let ca: &ChunkedArray<T> = s.as_ref().as_ref();
344    let ca = ca.rechunk();
345    ca.downcast_as_array().values().clone()
346}
347fn series_to_bitmap(s: Series) -> PyResult<Bitmap> {
348    let ca_result = s.bool();
349    let ca = ca_result.map_err(PyPolarsErr::from)?.rechunk();
350    Ok(ca.downcast_as_array().values().clone())
351}
352fn series_to_offsets(s: Series) -> OffsetsBuffer<i64> {
353    let buffer = series_to_buffer::<Int64Type>(s);
354    unsafe { OffsetsBuffer::new_unchecked(buffer) }
355}
356
357fn from_buffers_num_impl<T: NativeType>(
358    data: Buffer<T>,
359    validity: Option<Bitmap>,
360) -> PyResult<Series> {
361    let arr = PrimitiveArray::new(T::PRIMITIVE.into(), data, validity);
362    let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
363    let s = s_result.map_err(PyPolarsErr::from)?;
364    Ok(s)
365}
366fn from_buffers_bool_impl(data: Bitmap, validity: Option<Bitmap>) -> PyResult<Series> {
367    let arr = BooleanArray::new(ArrowDataType::Boolean, data, validity);
368    let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
369    let s = s_result.map_err(PyPolarsErr::from)?;
370    Ok(s)
371}
372/// Constructing a `String` Series requires specifying a values and offsets buffer,
373/// which does not match the actual underlying buffers. The values and offsets
374/// buffer are converted into the actual buffers, which copies data.
375fn from_buffers_string_impl(
376    data: Buffer<u8>,
377    validity: Option<Bitmap>,
378    offsets: OffsetsBuffer<i64>,
379) -> PyResult<Series> {
380    let arr = Utf8Array::new(ArrowDataType::LargeUtf8, offsets, data, validity);
381
382    // This is not zero-copy
383    let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
384
385    let s = s_result.map_err(PyPolarsErr::from)?;
386    Ok(s)
387}