// polars_row/row.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2use std::sync::Arc;
3
4use arrow::array::{BinaryArray, BinaryViewArray};
5use arrow::datatypes::ArrowDataType;
6use arrow::ffi::mmap;
7use arrow::offset::{Offsets, OffsetsBuffer};
8use polars_compute::cast::binary_to_binview;
9use polars_dtype::categorical::CategoricalMapping;
10
// Bytes written for encoded booleans in the ascending, non-inverted case.
// true (0x03) compares greater than false (0x02), preserving sort order;
// both are above the 0x00 nulls-first sentinel.
const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;
13
/// Additional context provided to row encoding regarding a column.
///
/// This allows communication based on the Polars datatype instead of on the Arrow datatype. Since
/// polars-row is used under polars-core, we don't have access to the actual datatypes.
#[derive(Debug, Clone)]
pub enum RowEncodingContext {
    /// Per-field contexts for a struct column; `None` means that field needs no extra context.
    Struct(Vec<Option<RowEncodingContext>>),
    /// Categorical / Enum
    Categorical(RowEncodingCategoricalContext),
    /// Decimal with given precision
    Decimal(usize),
}
26
/// Context needed to row-encode a Categorical / Enum column.
#[derive(Debug, Clone)]
pub struct RowEncodingCategoricalContext {
    // Distinguishes Enum from Categorical columns — presumably Enums have a fixed,
    // known-up-front category set; confirm against the encoder's use of this flag.
    pub is_enum: bool,
    // Shared categorical mapping (declared in polars-dtype); presumably maps category
    // ids to their string values — verify at the call sites.
    pub mapping: Arc<CategoricalMapping>,
}
32
bitflags::bitflags! {
    /// Options for the Polars Row Encoding.
    ///
    /// The row encoding provides a method to combine several columns into one binary column which
    /// has the same sort-order as the original columns.
    ///
    /// By default, the row encoding provides the ascending, nulls first sort-order of the columns.
    #[derive(Debug, Clone, Copy, Default)]
    pub struct RowEncodingOptions: u8 {
        /// Sort in descending order instead of ascending order
        const DESCENDING               = 0x01;
        /// Sort such that nulls / missing values are last
        const NULLS_LAST               = 0x02;

        /// Ignore all order-related flags and don't encode order-preserving. This will keep
        /// uniqueness.
        ///
        /// This is faster for several encodings
        const NO_ORDER                 = 0x04;
    }
}
54
// Token emitted for list encoding; its bitwise complement serves as the termination
// token (see `list_termination_token`).
const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
// Token for an empty string — distinct from the null sentinels (0x00 / 0xFF).
const EMPTY_STR_TOKEN: u8 = 0x01;
57
58impl RowEncodingOptions {
59    pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
60        let mut slf = Self::default();
61        slf.set(Self::DESCENDING, descending);
62        slf.set(Self::NULLS_LAST, nulls_last);
63        slf
64    }
65
66    pub fn new_unsorted() -> Self {
67        Self::NO_ORDER
68    }
69
70    pub fn is_ordered(self) -> bool {
71        !self.contains(Self::NO_ORDER)
72    }
73
74    pub fn null_sentinel(self) -> u8 {
75        if self.contains(Self::NULLS_LAST) {
76            0xFF
77        } else {
78            0x00
79        }
80    }
81
82    pub(crate) fn bool_true_sentinel(self) -> u8 {
83        if self.contains(Self::DESCENDING) {
84            !BOOLEAN_TRUE_SENTINEL
85        } else {
86            BOOLEAN_TRUE_SENTINEL
87        }
88    }
89
90    pub(crate) fn bool_false_sentinel(self) -> u8 {
91        if self.contains(Self::DESCENDING) {
92            !BOOLEAN_FALSE_SENTINEL
93        } else {
94            BOOLEAN_FALSE_SENTINEL
95        }
96    }
97
98    pub fn list_null_sentinel(self) -> u8 {
99        self.null_sentinel()
100    }
101
102    pub fn list_continuation_token(self) -> u8 {
103        if self.contains(Self::DESCENDING) {
104            !LIST_CONTINUATION_TOKEN
105        } else {
106            LIST_CONTINUATION_TOKEN
107        }
108    }
109
110    pub fn list_termination_token(self) -> u8 {
111        !self.list_continuation_token()
112    }
113
114    pub fn empty_str_token(self) -> u8 {
115        if self.contains(Self::DESCENDING) {
116            !EMPTY_STR_TOKEN
117        } else {
118            EMPTY_STR_TOKEN
119        }
120    }
121
122    pub fn into_nested(mut self) -> RowEncodingOptions {
123        // Correct nested ordering (see #22557)
124        self.set(
125            RowEncodingOptions::NULLS_LAST,
126            self.contains(RowEncodingOptions::DESCENDING),
127        );
128        self
129    }
130}
131
/// The output of row encoding: all rows concatenated into one byte buffer,
/// delimited by `offsets`.
#[derive(Default, Clone)]
pub struct RowsEncoded {
    // Concatenated row payloads.
    pub(crate) values: Vec<u8>,
    // Byte positions into `values`; consumed in `iter` as a leading start offset
    // followed by one end offset per row. NOTE(review): the default value is empty,
    // not `[0]` — some methods assume at least one offset; see `iter`/`borrow_array`.
    pub(crate) offsets: Vec<usize>,
}
137
/// Converts the raw row-encoding buffers into an arrow [`BinaryArray`] without copying
/// the payload.
///
/// # Safety
/// `offsets` must be monotonically increasing (required for `Offsets::new_unchecked`
/// below) and every offset must lie within `buf`.
/// NOTE(review): also assumes `offsets` is non-empty (`last().unwrap()`) — presumably
/// the encoder always writes a leading 0 offset; confirm at call sites.
unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
    // On targets where usize and i64 have the same width, reinterpret the Vec in place
    // instead of copying element-by-element.
    let offsets = if size_of::<usize>() == size_of::<i64>() {
        // Offsets are monotone, so checking the last (largest) one covers them all.
        assert!(
            (*offsets.last().unwrap() as u64) < i64::MAX as u64,
            "row encoding output overflowed"
        );

        // SAFETY: we checked overflow and size
        bytemuck::cast_vec::<usize, i64>(offsets)
    } else {
        // Narrower-usize targets (e.g. 32-bit): widen with a copy; cannot overflow i64.
        offsets.into_iter().map(|v| v as i64).collect()
    };

    // SAFETY: monotonically increasing
    let offsets = Offsets::new_unchecked(offsets);

    BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
}
156
157impl RowsEncoded {
158    pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
159        RowsEncoded { values, offsets }
160    }
161
162    pub fn iter(&self) -> RowsEncodedIter<'_> {
163        let iter = self.offsets[1..].iter();
164        let offset = self.offsets[0];
165        RowsEncodedIter {
166            offset,
167            end: iter,
168            values: &self.values,
169        }
170    }
171
172    /// Borrows the buffers and returns a [`BinaryArray`].
173    ///
174    /// # Safety
175    /// The lifetime of that `BinaryArray` is tied to the lifetime of
176    /// `Self`. The caller must ensure that both stay alive for the same time.
177    pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
178        let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
179        let offsets = if size_of::<usize>() == size_of::<i64>() {
180            assert!(
181                (*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
182                "row encoding output overflowed"
183            );
184
185            let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
186            let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
187            offsets
188        } else {
189            self.offsets.iter().map(|&v| v as i64).collect()
190        };
191        let offsets = OffsetsBuffer::new_unchecked(offsets);
192
193        BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
194    }
195
196    /// This conversion is free.
197    pub fn into_array(self) -> BinaryArray<i64> {
198        unsafe { rows_to_array(self.values, self.offsets) }
199    }
200
201    /// This does allocate views.
202    pub fn into_binview(self) -> BinaryViewArray {
203        binary_to_binview(&self.into_array())
204    }
205
206    #[cfg(test)]
207    pub fn get(&self, i: usize) -> &[u8] {
208        let start = self.offsets[i];
209        let end = self.offsets[i + 1];
210        &self.values[start..end]
211    }
212}
213
/// Iterator over the rows of a [`RowsEncoded`], yielding each row as a byte slice.
pub struct RowsEncodedIter<'a> {
    // Start offset (into `values`) of the next row to yield.
    offset: usize,
    // Remaining end offsets; each item marks where the current row stops.
    end: std::slice::Iter<'a, usize>,
    // The full encoded buffer that the offsets index into.
    values: &'a [u8],
}
219
220impl<'a> Iterator for RowsEncodedIter<'a> {
221    type Item = &'a [u8];
222
223    fn next(&mut self) -> Option<Self::Item> {
224        let new_offset = *self.end.next()?;
225        let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) };
226        self.offset = new_offset;
227        Some(payload)
228    }
229
230    fn size_hint(&self) -> (usize, Option<usize>) {
231        self.end.size_hint()
232    }
233}