1use arrow::array::View;
2use hashbrown::hash_table::{
3 Entry as TEntry, HashTable, OccupiedEntry as TOccupiedEntry, VacantEntry as TVacantEntry,
4};
5use polars_utils::IdxSize;
6
7const BASE_KEY_BUFFER_CAPACITY: usize = 1024;
8
9struct Key {
10 hash: u64,
11 view: View,
12}
13
14pub struct BinaryViewIndexMap<V> {
17 table: HashTable<IdxSize>,
18 tuples: Vec<(Key, V)>,
19 buffers: Vec<Vec<u8>>,
20
21 seed: u64,
24}
25
26impl<V> Default for BinaryViewIndexMap<V> {
27 fn default() -> Self {
28 Self {
29 table: HashTable::new(),
30 tuples: Vec::new(),
31 buffers: vec![],
32 seed: rand::random::<u64>() | 1,
33 }
34 }
35}
36
37impl<V> BinaryViewIndexMap<V> {
38 pub fn new() -> Self {
39 Self::default()
40 }
41
42 pub fn reserve(&mut self, additional: usize) {
43 self.table.reserve(additional, |i| unsafe {
44 let tuple = self.tuples.get_unchecked(*i as usize);
45 tuple.0.hash.wrapping_mul(self.seed)
46 });
47 self.tuples.reserve(additional);
48 }
49
50 pub fn len(&self) -> IdxSize {
51 self.tuples.len() as IdxSize
52 }
53
54 pub fn is_empty(&self) -> bool {
55 self.tuples.is_empty()
56 }
57
58 pub fn buffers(&self) -> &[Vec<u8>] {
59 &self.buffers
60 }
61
62 #[inline]
63 pub fn get(&self, hash: u64, key: &[u8]) -> Option<&V> {
64 unsafe {
65 if key.len() <= View::MAX_INLINE_SIZE as usize {
66 self.get_inline_view(hash, &View::new_inline_unchecked(key))
67 } else {
68 self.get_long_key(hash, key)
69 }
70 }
71 }
72
73 #[inline]
76 pub unsafe fn get_view<B: AsRef<[u8]>>(
77 &self,
78 hash: u64,
79 key: &View,
80 buffers: &[B],
81 ) -> Option<&V> {
82 unsafe {
83 if key.length <= View::MAX_INLINE_SIZE {
84 self.get_inline_view(hash, key)
85 } else {
86 self.get_long_key(hash, key.get_external_slice_unchecked(buffers))
87 }
88 }
89 }
90
91 pub unsafe fn get_inline_view(&self, hash: u64, key: &View) -> Option<&V> {
94 unsafe {
95 debug_assert!(key.length <= View::MAX_INLINE_SIZE);
96 let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {
97 let t = self.tuples.get_unchecked(*i as usize);
98 *key == t.0.view
99 })?;
100 Some(&self.tuples.get_unchecked(*idx as usize).1)
101 }
102 }
103
104 pub unsafe fn get_long_key(&self, hash: u64, key: &[u8]) -> Option<&V> {
107 unsafe {
108 debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);
109 let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {
110 let t = self.tuples.get_unchecked(*i as usize);
111 hash == t.0.hash
112 && key.len() == t.0.view.length as usize
113 && key == t.0.view.get_external_slice_unchecked(&self.buffers)
114 })?;
115 Some(&self.tuples.get_unchecked(*idx as usize).1)
116 }
117 }
118
119 #[inline]
120 pub fn entry<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
121 unsafe {
122 if key.len() <= View::MAX_INLINE_SIZE as usize {
123 self.entry_inline_view(hash, View::new_inline_unchecked(key))
124 } else {
125 self.entry_long_key(hash, key)
126 }
127 }
128 }
129
130 #[inline]
133 pub unsafe fn entry_view<'k, B: AsRef<[u8]>>(
134 &mut self,
135 hash: u64,
136 key: View,
137 buffers: &'k [B],
138 ) -> Entry<'_, 'k, V> {
139 unsafe {
140 if key.length <= View::MAX_INLINE_SIZE {
141 self.entry_inline_view(hash, key)
142 } else {
143 self.entry_long_key(hash, key.get_external_slice_unchecked(buffers))
144 }
145 }
146 }
147
148 pub unsafe fn entry_inline_view<'k>(&mut self, hash: u64, key: View) -> Entry<'_, 'k, V> {
151 debug_assert!(key.length <= View::MAX_INLINE_SIZE);
152 let entry = self.table.entry(
153 hash.wrapping_mul(self.seed),
154 |i| unsafe {
155 let t = self.tuples.get_unchecked(*i as usize);
156 key == t.0.view
157 },
158 |i| unsafe {
159 let t = self.tuples.get_unchecked(*i as usize);
160 t.0.hash.wrapping_mul(self.seed)
161 },
162 );
163
164 match entry {
165 TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
166 entry: o,
167 tuples: &mut self.tuples,
168 }),
169 TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
170 view: key,
171 external: None,
172 hash,
173 entry: v,
174 tuples: &mut self.tuples,
175 buffers: &mut self.buffers,
176 }),
177 }
178 }
179
180 pub unsafe fn entry_long_key<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
183 debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);
184 let entry = self.table.entry(
185 hash.wrapping_mul(self.seed),
186 |i| unsafe {
187 let t = self.tuples.get_unchecked(*i as usize);
188 hash == t.0.hash
189 && key.len() == t.0.view.length as usize
190 && key == t.0.view.get_external_slice_unchecked(&self.buffers)
191 },
192 |i| unsafe {
193 let t = self.tuples.get_unchecked(*i as usize);
194 t.0.hash.wrapping_mul(self.seed)
195 },
196 );
197
198 match entry {
199 TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
200 entry: o,
201 tuples: &mut self.tuples,
202 }),
203 TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
204 view: View::default(),
205 external: Some(key),
206 hash,
207 entry: v,
208 tuples: &mut self.tuples,
209 buffers: &mut self.buffers,
210 }),
211 }
212 }
213
214 pub fn push_unmapped_empty_entry(&mut self, value: V) -> IdxSize {
218 let ret = self.tuples.len() as IdxSize;
219 let key = Key {
220 hash: 0,
221 view: View::default(),
222 };
223 self.tuples.push((key, value));
224 ret
225 }
226
227 #[inline(always)]
229 pub fn get_index(&self, idx: IdxSize) -> Option<(u64, &[u8], &V)> {
230 let t = self.tuples.get(idx as usize)?;
231 Some((
232 t.0.hash,
233 unsafe { t.0.view.get_slice_unchecked(&self.buffers) },
234 &t.1,
235 ))
236 }
237
238 #[inline(always)]
243 pub unsafe fn get_index_unchecked(&self, idx: IdxSize) -> (u64, &[u8], &V) {
244 let t = unsafe { self.tuples.get_unchecked(idx as usize) };
245 unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers), &t.1) }
246 }
247
248 #[inline(always)]
253 pub unsafe fn get_index_view_unchecked(&self, idx: IdxSize) -> (u64, View, &V) {
254 let t = unsafe { self.tuples.get_unchecked(idx as usize) };
255 (t.0.hash, t.0.view, &t.1)
256 }
257
258 pub fn iter_hash_keys(&self) -> impl Iterator<Item = (u64, &[u8])> {
260 self.tuples
261 .iter()
262 .map(|t| unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers)) })
263 }
264
265 pub fn iter_hash_views(&self) -> impl Iterator<Item = (u64, View)> {
267 self.tuples.iter().map(|t| (t.0.hash, t.0.view))
268 }
269
270 pub fn iter_values(&self) -> impl Iterator<Item = &V> {
272 self.tuples.iter().map(|t| &t.1)
273 }
274}
275
276pub enum Entry<'a, 'k, V> {
277 Occupied(OccupiedEntry<'a, V>),
278 Vacant(VacantEntry<'a, 'k, V>),
279}
280
281pub struct OccupiedEntry<'a, V> {
282 entry: TOccupiedEntry<'a, IdxSize>,
283 tuples: &'a mut Vec<(Key, V)>,
284}
285
286impl<'a, V> OccupiedEntry<'a, V> {
287 #[inline]
288 pub fn index(&self) -> IdxSize {
289 *self.entry.get()
290 }
291
292 #[inline]
293 pub fn into_mut(self) -> &'a mut V {
294 let idx = self.index();
295 unsafe { &mut self.tuples.get_unchecked_mut(idx as usize).1 }
296 }
297}
298
299pub struct VacantEntry<'a, 'k, V> {
300 hash: u64,
301 view: View, external: Option<&'k [u8]>, entry: TVacantEntry<'a, IdxSize>,
304 tuples: &'a mut Vec<(Key, V)>,
305 buffers: &'a mut Vec<Vec<u8>>,
306}
307
308#[allow(clippy::needless_lifetimes)]
309impl<'a, 'k, V> VacantEntry<'a, 'k, V> {
310 #[inline]
311 pub fn index(&self) -> IdxSize {
312 self.tuples.len() as IdxSize
313 }
314
315 #[inline]
316 pub fn insert(self, value: V) -> &'a mut V {
317 unsafe {
318 let tuple_idx: IdxSize = self.tuples.len().try_into().unwrap();
319 let view = if let Some(key) = self.external {
320 if self
321 .buffers
322 .last()
323 .is_none_or(|buf| buf.len() + key.len() > buf.capacity())
324 {
325 let ideal_next_cap = BASE_KEY_BUFFER_CAPACITY
326 .checked_shl(self.buffers.len() as u32)
327 .unwrap();
328 let next_capacity = std::cmp::max(ideal_next_cap, key.len());
329 self.buffers.push(Vec::with_capacity(next_capacity));
330 }
331 let buffer_idx = (self.buffers.len() - 1) as u32;
332 let active_buf = self.buffers.last_mut().unwrap_unchecked();
333 let offset = active_buf.len() as u32;
334 active_buf.extend_from_slice(key);
335 View::new_from_bytes(key, buffer_idx, offset)
336 } else {
337 self.view
338 };
339 let tuple_key = Key {
340 hash: self.hash,
341 view,
342 };
343 self.tuples.push((tuple_key, value));
344 self.entry.insert(tuple_idx);
345 &mut self.tuples.last_mut().unwrap_unchecked().1
346 }
347 }
348}