1#![allow(unsafe_op_in_unsafe_fn)]
2use std::sync::Arc;
3
4use arrow::array::{BinaryArray, BinaryViewArray};
5use arrow::datatypes::ArrowDataType;
6use arrow::ffi::mmap;
7use arrow::offset::{Offsets, OffsetsBuffer};
8use polars_compute::cast::binary_to_binview;
9use polars_dtype::categorical::CategoricalMapping;
10
11const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
12const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;
13
14#[derive(Debug, Clone)]
19pub enum RowEncodingContext {
20 Struct(Vec<Option<RowEncodingContext>>),
21 Categorical(RowEncodingCategoricalContext),
23 Decimal(usize),
25}
26
27#[derive(Debug, Clone)]
28pub struct RowEncodingCategoricalContext {
29 pub is_enum: bool,
30 pub mapping: Arc<CategoricalMapping>,
31}
32
33bitflags::bitflags! {
34 #[derive(Debug, Clone, Copy, Default)]
41 pub struct RowEncodingOptions: u8 {
42 const DESCENDING = 0x01;
44 const NULLS_LAST = 0x02;
46
47 const NO_ORDER = 0x04;
52 }
53}
54
55const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
56const EMPTY_STR_TOKEN: u8 = 0x01;
57
58impl RowEncodingOptions {
59 pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
60 let mut slf = Self::default();
61 slf.set(Self::DESCENDING, descending);
62 slf.set(Self::NULLS_LAST, nulls_last);
63 slf
64 }
65
66 pub fn new_unsorted() -> Self {
67 Self::NO_ORDER
68 }
69
70 pub fn is_ordered(self) -> bool {
71 !self.contains(Self::NO_ORDER)
72 }
73
74 pub fn null_sentinel(self) -> u8 {
75 if self.contains(Self::NULLS_LAST) {
76 0xFF
77 } else {
78 0x00
79 }
80 }
81
82 pub(crate) fn bool_true_sentinel(self) -> u8 {
83 if self.contains(Self::DESCENDING) {
84 !BOOLEAN_TRUE_SENTINEL
85 } else {
86 BOOLEAN_TRUE_SENTINEL
87 }
88 }
89
90 pub(crate) fn bool_false_sentinel(self) -> u8 {
91 if self.contains(Self::DESCENDING) {
92 !BOOLEAN_FALSE_SENTINEL
93 } else {
94 BOOLEAN_FALSE_SENTINEL
95 }
96 }
97
98 pub fn list_null_sentinel(self) -> u8 {
99 self.null_sentinel()
100 }
101
102 pub fn list_continuation_token(self) -> u8 {
103 if self.contains(Self::DESCENDING) {
104 !LIST_CONTINUATION_TOKEN
105 } else {
106 LIST_CONTINUATION_TOKEN
107 }
108 }
109
110 pub fn list_termination_token(self) -> u8 {
111 !self.list_continuation_token()
112 }
113
114 pub fn empty_str_token(self) -> u8 {
115 if self.contains(Self::DESCENDING) {
116 !EMPTY_STR_TOKEN
117 } else {
118 EMPTY_STR_TOKEN
119 }
120 }
121
122 pub fn into_nested(mut self) -> RowEncodingOptions {
123 self.set(
125 RowEncodingOptions::NULLS_LAST,
126 self.contains(RowEncodingOptions::DESCENDING),
127 );
128 self
129 }
130}
131
132#[derive(Default, Clone)]
133pub struct RowsEncoded {
134 pub(crate) values: Vec<u8>,
135 pub(crate) offsets: Vec<usize>,
136}
137
138unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
139 let offsets = if size_of::<usize>() == size_of::<i64>() {
140 assert!(
141 (*offsets.last().unwrap() as u64) < i64::MAX as u64,
142 "row encoding output overflowed"
143 );
144
145 bytemuck::cast_vec::<usize, i64>(offsets)
147 } else {
148 offsets.into_iter().map(|v| v as i64).collect()
149 };
150
151 let offsets = Offsets::new_unchecked(offsets);
153
154 BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
155}
156
157impl RowsEncoded {
158 pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
159 RowsEncoded { values, offsets }
160 }
161
162 pub fn iter(&self) -> RowsEncodedIter<'_> {
163 let iter = self.offsets[1..].iter();
164 let offset = self.offsets[0];
165 RowsEncodedIter {
166 offset,
167 end: iter,
168 values: &self.values,
169 }
170 }
171
172 pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
178 let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
179 let offsets = if size_of::<usize>() == size_of::<i64>() {
180 assert!(
181 (*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
182 "row encoding output overflowed"
183 );
184
185 let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
186 let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
187 offsets
188 } else {
189 self.offsets.iter().map(|&v| v as i64).collect()
190 };
191 let offsets = OffsetsBuffer::new_unchecked(offsets);
192
193 BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
194 }
195
196 pub fn into_array(self) -> BinaryArray<i64> {
198 unsafe { rows_to_array(self.values, self.offsets) }
199 }
200
201 pub fn into_binview(self) -> BinaryViewArray {
203 binary_to_binview(&self.into_array())
204 }
205
206 #[cfg(test)]
207 pub fn get(&self, i: usize) -> &[u8] {
208 let start = self.offsets[i];
209 let end = self.offsets[i + 1];
210 &self.values[start..end]
211 }
212}
213
214pub struct RowsEncodedIter<'a> {
215 offset: usize,
216 end: std::slice::Iter<'a, usize>,
217 values: &'a [u8],
218}
219
220impl<'a> Iterator for RowsEncodedIter<'a> {
221 type Item = &'a [u8];
222
223 fn next(&mut self) -> Option<Self::Item> {
224 let new_offset = *self.end.next()?;
225 let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) };
226 self.offset = new_offset;
227 Some(payload)
228 }
229
230 fn size_hint(&self) -> (usize, Option<usize>) {
231 self.end.size_hint()
232 }
233}