polars_row/row.rs

#![allow(unsafe_op_in_unsafe_fn)]
use arrow::array::{BinaryArray, BinaryViewArray};
use arrow::datatypes::ArrowDataType;
use arrow::ffi::mmap;
use arrow::offset::{Offsets, OffsetsBuffer};
use polars_compute::cast::binary_to_binview;

const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;

/// Additional context provided to row encoding regarding a column.
///
/// This allows communication based on the Polars datatype instead of the Arrow datatype. Since
/// polars-row is used under polars-core, we don't have access to the actual datatypes.
#[derive(Debug, Clone)]
pub enum RowEncodingContext {
    Struct(Vec<Option<RowEncodingContext>>),
    /// Categorical / Enum
    Categorical(RowEncodingCategoricalContext),
    /// Decimal with the given precision
    Decimal(usize),
}

#[derive(Debug, Clone)]
pub struct RowEncodingCategoricalContext {
    /// The number of currently known categories in the categorical / enum.
    pub num_known_categories: u32,
    pub is_enum: bool,

    /// The mapping from key to lexical sort index.
    pub lexical_sort_idxs: Option<Vec<u32>>,
}

impl RowEncodingCategoricalContext {
    pub fn needed_num_bits(&self) -> usize {
        if self.num_known_categories == 0 {
            0
        } else {
            let max_category_index = self.num_known_categories - 1;
            (max_category_index.next_power_of_two().trailing_zeros() + 1) as usize
        }
    }
}
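
// Illustrative sketch (not an exhaustive test): for a non-zero category count,
// `needed_num_bits` evaluates to `ceil(log2(max_category_index)) + 1`, with a
// `max_category_index` of 0 mapping to 1 bit.
#[cfg(test)]
mod needed_num_bits_sketch {
    use super::*;

    #[test]
    fn bit_widths() {
        let ctx = |num_known_categories| RowEncodingCategoricalContext {
            num_known_categories,
            is_enum: false,
            lexical_sort_idxs: None,
        };
        assert_eq!(ctx(0).needed_num_bits(), 0); // no categories at all
        assert_eq!(ctx(1).needed_num_bits(), 1); // max index 0
        assert_eq!(ctx(3).needed_num_bits(), 2); // max index 2
        assert_eq!(ctx(5).needed_num_bits(), 3); // max index 4
        assert_eq!(ctx(6).needed_num_bits(), 4); // max index 5
    }
}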

bitflags::bitflags! {
    /// Options for the Polars Row Encoding.
    ///
    /// The row encoding provides a method to combine several columns into one binary column which
    /// has the same sort-order as the original columns.
    ///
    /// By default, the row encoding provides the ascending, nulls-first sort-order of the columns.
    #[derive(Debug, Clone, Copy, Default)]
    pub struct RowEncodingOptions: u8 {
        /// Sort in descending order instead of ascending order.
        const DESCENDING               = 0x01;
        /// Sort such that nulls / missing values are last.
        const NULLS_LAST               = 0x02;

        /// Ignore all order-related flags and don't encode in an order-preserving way. This
        /// still preserves uniqueness.
        ///
        /// This is faster for several encodings.
        const NO_ORDER                 = 0x04;
    }
}
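
// Illustrative sketch (assumes standard bitflags semantics): the options
// compose with bitwise OR, and the derived `Default` is the empty set, i.e.
// the ascending, nulls-first ordering described above.
#[cfg(test)]
mod options_flags_sketch {
    use super::*;

    #[test]
    fn flags_compose() {
        let opts = RowEncodingOptions::DESCENDING | RowEncodingOptions::NULLS_LAST;
        assert!(opts.contains(RowEncodingOptions::DESCENDING | RowEncodingOptions::NULLS_LAST));
        assert!(RowEncodingOptions::default().is_empty());
    }
}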

const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
const EMPTY_STR_TOKEN: u8 = 0x01;

impl RowEncodingOptions {
    pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
        let mut slf = Self::default();
        slf.set(Self::DESCENDING, descending);
        slf.set(Self::NULLS_LAST, nulls_last);
        slf
    }

    pub fn new_unsorted() -> Self {
        Self::NO_ORDER
    }

    pub fn is_ordered(self) -> bool {
        !self.contains(Self::NO_ORDER)
    }

    pub fn null_sentinel(self) -> u8 {
        if self.contains(Self::NULLS_LAST) {
            0xFF
        } else {
            0x00
        }
    }

    pub(crate) fn bool_true_sentinel(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !BOOLEAN_TRUE_SENTINEL
        } else {
            BOOLEAN_TRUE_SENTINEL
        }
    }

    pub(crate) fn bool_false_sentinel(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !BOOLEAN_FALSE_SENTINEL
        } else {
            BOOLEAN_FALSE_SENTINEL
        }
    }

    pub fn list_null_sentinel(self) -> u8 {
        self.null_sentinel()
    }

    pub fn list_continuation_token(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !LIST_CONTINUATION_TOKEN
        } else {
            LIST_CONTINUATION_TOKEN
        }
    }

    pub fn list_termination_token(self) -> u8 {
        !self.list_continuation_token()
    }

    pub fn empty_str_token(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !EMPTY_STR_TOKEN
        } else {
            EMPTY_STR_TOKEN
        }
    }

    pub fn into_nested(mut self) -> RowEncodingOptions {
        // Correct nested ordering (see #22557)
        self.set(
            RowEncodingOptions::NULLS_LAST,
            self.contains(RowEncodingOptions::DESCENDING),
        );
        self
    }
}
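
// Illustrative sketch of the sentinel/token logic above: the value sentinels
// and tokens are bitwise-complemented when DESCENDING is set, the null
// sentinel tracks NULLS_LAST instead, and `into_nested` forces NULLS_LAST for
// descending encodings (see #22557).
#[cfg(test)]
mod options_sentinel_sketch {
    use super::*;

    #[test]
    fn sentinels_follow_the_flags() {
        let asc = RowEncodingOptions::new_sorted(false, false);
        let desc = RowEncodingOptions::new_sorted(true, false);

        assert_eq!(asc.bool_true_sentinel(), BOOLEAN_TRUE_SENTINEL);
        assert_eq!(desc.bool_true_sentinel(), !BOOLEAN_TRUE_SENTINEL);

        // Nulls sort first (0x00) by default and last (0xFF) with NULLS_LAST.
        assert_eq!(asc.null_sentinel(), 0x00);
        assert_eq!(RowEncodingOptions::new_sorted(false, true).null_sentinel(), 0xFF);

        assert!(desc.into_nested().contains(RowEncodingOptions::NULLS_LAST));
        assert!(!asc.into_nested().contains(RowEncodingOptions::NULLS_LAST));
    }
}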

#[derive(Default, Clone)]
pub struct RowsEncoded {
    pub(crate) values: Vec<u8>,
    pub(crate) offsets: Vec<usize>,
}

unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
    let offsets = if size_of::<usize>() == size_of::<i64>() {
        assert!(
            (*offsets.last().unwrap() as u64) < i64::MAX as u64,
            "row encoding output overflowed"
        );

        // SAFETY: we checked overflow and size
        bytemuck::cast_vec::<usize, i64>(offsets)
    } else {
        offsets.into_iter().map(|v| v as i64).collect()
    };

    // SAFETY: monotonically increasing
    let offsets = Offsets::new_unchecked(offsets);

    BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
}

impl RowsEncoded {
    pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
        RowsEncoded { values, offsets }
    }

    pub fn iter(&self) -> RowsEncodedIter {
        let iter = self.offsets[1..].iter();
        let offset = self.offsets[0];
        RowsEncodedIter {
            offset,
            end: iter,
            values: &self.values,
        }
    }

    /// Borrows the buffers and returns a [`BinaryArray`].
    ///
    /// # Safety
    /// The lifetime of that `BinaryArray` is tied to the lifetime of
    /// `Self`. The caller must ensure that both stay alive for the same time.
    pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
        let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
        let offsets = if size_of::<usize>() == size_of::<i64>() {
            assert!(
                (*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
                "row encoding output overflowed"
            );

            let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
            let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
            offsets
        } else {
            self.offsets.iter().map(|&v| v as i64).collect()
        };
        let offsets = OffsetsBuffer::new_unchecked(offsets);

        BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
    }

    /// This conversion is free.
    pub fn into_array(self) -> BinaryArray<i64> {
        unsafe { rows_to_array(self.values, self.offsets) }
    }

    /// This does allocate views.
    pub fn into_binview(self) -> BinaryViewArray {
        binary_to_binview(&self.into_array())
    }

    #[cfg(test)]
    pub fn get(&self, i: usize) -> &[u8] {
        let start = self.offsets[i];
        let end = self.offsets[i + 1];
        &self.values[start..end]
    }
}
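
// Illustrative sketch of the buffer layout: `offsets` holds one entry more
// than the number of rows, so row `i` spans `values[offsets[i]..offsets[i + 1]]`;
// `into_array` hands both buffers to the `BinaryArray` without copying.
#[cfg(test)]
mod rows_encoded_sketch {
    use super::*;

    #[test]
    fn iter_and_into_array() {
        // Two rows: b"ab" and b"cd".
        let rows = RowsEncoded::new(b"abcd".to_vec(), vec![0, 2, 4]);

        let collected: Vec<&[u8]> = rows.iter().collect();
        assert_eq!(collected, vec![b"ab".as_slice(), b"cd".as_slice()]);

        let arr = rows.into_array();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr.value(0), b"ab");
        assert_eq!(arr.value(1), b"cd");
    }
}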

pub struct RowsEncodedIter<'a> {
    offset: usize,
    end: std::slice::Iter<'a, usize>,
    values: &'a [u8],
}

impl<'a> Iterator for RowsEncodedIter<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        let new_offset = *self.end.next()?;
        let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) };
        self.offset = new_offset;
        Some(payload)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.end.size_hint()
    }
}