1#![allow(unsafe_op_in_unsafe_fn)]
2use arrow::array::{BinaryArray, BinaryViewArray};
3use arrow::datatypes::ArrowDataType;
4use arrow::ffi::mmap;
5use arrow::offset::{Offsets, OffsetsBuffer};
6use polars_compute::cast::binary_to_binview;
7
// Byte written for boolean `true`. 0x03 sorts after the `false` sentinel (0x02),
// and both are distinct from the null sentinels 0x00/0xFF (see `null_sentinel`).
const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
// Byte written for boolean `false`.
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;
10
/// Extra, dtype-dependent context needed to row-encode a column.
#[derive(Debug, Clone)]
pub enum RowEncodingContext {
    /// One optional context per struct field, in field order.
    Struct(Vec<Option<RowEncodingContext>>),
    Categorical(RowEncodingCategoricalContext),
    /// NOTE(review): presumably the decimal *precision* used to size the
    /// encoding — confirm against the encoder that consumes this variant.
    Decimal(usize),
}
23
24#[derive(Debug, Clone)]
25pub struct RowEncodingCategoricalContext {
26 pub num_known_categories: u32,
28 pub is_enum: bool,
29
30 pub lexical_sort_idxs: Option<Vec<u32>>,
32}
33
34impl RowEncodingCategoricalContext {
35 pub fn needed_num_bits(&self) -> usize {
36 if self.num_known_categories == 0 {
37 0
38 } else {
39 let max_category_index = self.num_known_categories - 1;
40 (max_category_index.next_power_of_two().trailing_zeros() + 1) as usize
41 }
42 }
43}
44
bitflags::bitflags! {
    /// Per-column options controlling how values are row-encoded.
    #[derive(Debug, Clone, Copy, Default)]
    pub struct RowEncodingOptions: u8 {
        /// Sort this column descending: sentinel/token bytes are bit-inverted
        /// (see `bool_true_sentinel` etc.) so the byte order flips.
        const DESCENDING = 0x01;
        /// Sort nulls after all valid values (null sentinel becomes 0xFF
        /// instead of 0x00 — see `null_sentinel`).
        const NULLS_LAST = 0x02;

        /// The encoding need not preserve ordering (see `new_unsorted`).
        const NO_ORDER = 0x04;
    }
}
66
// Token emitted by `list_continuation_token`; its bit-complement is the
// termination token (see `list_termination_token`).
const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
// Token for an empty string; distinct from the 0x00/0xFF null sentinels.
const EMPTY_STR_TOKEN: u8 = 0x01;
69
70impl RowEncodingOptions {
71 pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
72 let mut slf = Self::default();
73 slf.set(Self::DESCENDING, descending);
74 slf.set(Self::NULLS_LAST, nulls_last);
75 slf
76 }
77
78 pub fn new_unsorted() -> Self {
79 Self::NO_ORDER
80 }
81
82 pub fn is_ordered(self) -> bool {
83 !self.contains(Self::NO_ORDER)
84 }
85
86 pub fn null_sentinel(self) -> u8 {
87 if self.contains(Self::NULLS_LAST) {
88 0xFF
89 } else {
90 0x00
91 }
92 }
93
94 pub(crate) fn bool_true_sentinel(self) -> u8 {
95 if self.contains(Self::DESCENDING) {
96 !BOOLEAN_TRUE_SENTINEL
97 } else {
98 BOOLEAN_TRUE_SENTINEL
99 }
100 }
101
102 pub(crate) fn bool_false_sentinel(self) -> u8 {
103 if self.contains(Self::DESCENDING) {
104 !BOOLEAN_FALSE_SENTINEL
105 } else {
106 BOOLEAN_FALSE_SENTINEL
107 }
108 }
109
110 pub fn list_null_sentinel(self) -> u8 {
111 self.null_sentinel()
112 }
113
114 pub fn list_continuation_token(self) -> u8 {
115 if self.contains(Self::DESCENDING) {
116 !LIST_CONTINUATION_TOKEN
117 } else {
118 LIST_CONTINUATION_TOKEN
119 }
120 }
121
122 pub fn list_termination_token(self) -> u8 {
123 !self.list_continuation_token()
124 }
125
126 pub fn empty_str_token(self) -> u8 {
127 if self.contains(Self::DESCENDING) {
128 !EMPTY_STR_TOKEN
129 } else {
130 EMPTY_STR_TOKEN
131 }
132 }
133
134 pub fn into_nested(mut self) -> RowEncodingOptions {
135 self.set(
137 RowEncodingOptions::NULLS_LAST,
138 self.contains(RowEncodingOptions::DESCENDING),
139 );
140 self
141 }
142}
143
/// Row-encoded output: one flat byte buffer plus per-row offsets into it.
#[derive(Default, Clone)]
pub struct RowsEncoded {
    pub(crate) values: Vec<u8>,
    /// Row boundaries: row `i` is `values[offsets[i]..offsets[i + 1]]`
    /// (see `get`/`iter`), so a valid buffer holds `num_rows + 1` offsets.
    pub(crate) offsets: Vec<usize>,
}
149
/// Convert raw row-encoded bytes + offsets into a `BinaryArray<i64>`,
/// reusing the allocations (no copy) on 64-bit targets.
///
/// # Safety
/// `offsets` must be non-empty and valid arrow offsets (monotonically
/// non-decreasing, in-bounds for `buf`) — `Offsets::new_unchecked` does not
/// re-check this. TODO confirm: callers (`RowsEncoded::into_array`) are
/// assumed to uphold it.
unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
    let offsets = if size_of::<usize>() == size_of::<i64>() {
        // Same layout: reinterpret the Vec in place. The assert guards the
        // usize -> i64 reinterpretation (panics if offsets is empty).
        assert!(
            (*offsets.last().unwrap() as u64) < i64::MAX as u64,
            "row encoding output overflowed"
        );

        bytemuck::cast_vec::<usize, i64>(offsets)
    } else {
        // 32-bit (or unusual) targets: widen element by element.
        offsets.into_iter().map(|v| v as i64).collect()
    };

    // SAFETY: offset validity is the caller's contract (see above).
    let offsets = Offsets::new_unchecked(offsets);

    BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
}
168
impl RowsEncoded {
    pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
        RowsEncoded { values, offsets }
    }

    /// Iterate over the encoded rows as byte slices.
    ///
    /// Panics if `offsets` is empty — it must hold at least the initial offset.
    pub fn iter(&self) -> RowsEncodedIter {
        let iter = self.offsets[1..].iter();
        let offset = self.offsets[0];
        RowsEncodedIter {
            offset,
            end: iter,
            values: &self.values,
        }
    }

    /// View the rows as a `BinaryArray<i64>` without copying the buffers.
    ///
    /// # Safety
    /// The returned array aliases `self`'s buffers through `mmap::slice`, so
    /// it must not outlive (or be used after mutation of) `self` —
    /// TODO confirm the exact lifetime contract expected by callers.
    pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
        let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
        let offsets = if size_of::<usize>() == size_of::<i64>() {
            // Same layout: reinterpret the offsets slice in place. The assert
            // guards the usize -> i64 cast (panics if offsets is empty).
            assert!(
                (*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
                "row encoding output overflowed"
            );

            let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
            let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
            offsets
        } else {
            // 32-bit (or unusual) targets: widen element by element (copies).
            self.offsets.iter().map(|&v| v as i64).collect()
        };
        // SAFETY: offsets originate from row encoding and are assumed
        // monotone and in-bounds — not re-checked here.
        let offsets = OffsetsBuffer::new_unchecked(offsets);

        BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
    }

    /// Consume `self` into an owned `BinaryArray<i64>` (allocation reuse on
    /// 64-bit targets; see `rows_to_array`).
    pub fn into_array(self) -> BinaryArray<i64> {
        unsafe { rows_to_array(self.values, self.offsets) }
    }

    /// Consume `self` into a `BinaryViewArray` (goes through `into_array`,
    /// then casts binary -> binview).
    pub fn into_binview(self) -> BinaryViewArray {
        binary_to_binview(&self.into_array())
    }

    #[cfg(test)]
    pub fn get(&self, i: usize) -> &[u8] {
        let start = self.offsets[i];
        let end = self.offsets[i + 1];
        &self.values[start..end]
    }
}
225
/// Iterator over the rows of a `RowsEncoded`, yielding one byte slice per row.
pub struct RowsEncodedIter<'a> {
    /// Start of the next row in `values`.
    offset: usize,
    /// Remaining end offsets, one per row still to be yielded.
    end: std::slice::Iter<'a, usize>,
    /// The shared flat byte buffer.
    values: &'a [u8],
}

impl<'a> Iterator for RowsEncodedIter<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        self.end.next().map(|&row_end| {
            // Advance `offset` to the end of this row, keeping the old start.
            let row_start = std::mem::replace(&mut self.offset, row_end);
            // SAFETY: relies on the offsets being monotone and in-bounds for
            // `values` — upheld by `RowsEncoded::iter`'s construction.
            unsafe { self.values.get_unchecked(row_start..row_end) }
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.end.size_hint()
    }
}