// polars_compute/binview_index_map.rs

1use arrow::array::View;
2use hashbrown::hash_table::{
3    Entry as TEntry, HashTable, OccupiedEntry as TOccupiedEntry, VacantEntry as TVacantEntry,
4};
5use polars_utils::IdxSize;
6
/// Capacity of the first long-key buffer; each subsequent buffer targets
/// double the previous ideal capacity (see `VacantEntry::insert`).
const BASE_KEY_BUFFER_CAPACITY: usize = 1024;
8
/// A stored key: its precomputed hash plus a `View`. Short keys are inlined
/// in the view; long keys point into `BinaryViewIndexMap::buffers`.
struct Key {
    hash: u64,
    view: View,
}
13
/// An IndexMap where the keys are [u8] slices or `View`s which are pre-hashed.
/// Does not support deletion.
pub struct BinaryViewIndexMap<V> {
    // Maps seeded hash -> index into `tuples`.
    table: HashTable<IdxSize>,
    // (key, value) pairs in insertion order.
    tuples: Vec<(Key, V)>,
    // Backing storage for the bytes of non-inlined (long) keys.
    buffers: Vec<Vec<u8>>,

    // Internal random seed used to keep hash iteration order decorrelated.
    // We simply store a random odd number and multiply the canonical hash by it.
    seed: u64,
}
25
26impl<V> Default for BinaryViewIndexMap<V> {
27    fn default() -> Self {
28        Self {
29            table: HashTable::new(),
30            tuples: Vec::new(),
31            buffers: vec![],
32            seed: rand::random::<u64>() | 1,
33        }
34    }
35}
36
impl<V> BinaryViewIndexMap<V> {
    pub fn new() -> Self {
        Self::default()
    }

    /// Reserves room for at least `additional` extra entries in both the
    /// hash table and the tuple storage.
    pub fn reserve(&mut self, additional: usize) {
        // The table stores only indices into `tuples`, so the rehash closure
        // must look the stored hash back up and re-apply the seed multiplier.
        self.table.reserve(additional, |i| unsafe {
            let tuple = self.tuples.get_unchecked(*i as usize);
            tuple.0.hash.wrapping_mul(self.seed)
        });
        self.tuples.reserve(additional);
    }

    /// Number of entries (including unmapped ones), in insertion order.
    // NOTE(review): this cast silently truncates beyond IdxSize::MAX entries;
    // `VacantEntry::insert` guards growth with `try_into().unwrap()`, so in
    // practice tables that reach this size panic on insert first — confirm.
    pub fn len(&self) -> IdxSize {
        self.tuples.len() as IdxSize
    }

    pub fn is_empty(&self) -> bool {
        self.tuples.is_empty()
    }

    /// The buffers backing the bytes of non-inlined (long) keys.
    pub fn buffers(&self) -> &[Vec<u8>] {
        &self.buffers
    }

    /// Looks up `key` (pre-hashed as `hash`), dispatching on whether the key
    /// is short enough to be stored inline in a `View`.
    #[inline]
    pub fn get(&self, hash: u64, key: &[u8]) -> Option<&V> {
        unsafe {
            if key.len() <= View::MAX_INLINE_SIZE as usize {
                // SAFETY: length checked above, so the key fits inline.
                self.get_inline_view(hash, &View::new_inline_unchecked(key))
            } else {
                self.get_long_key(hash, key)
            }
        }
    }

    /// Looks up a key given as a `View` into the caller's `buffers`.
    ///
    /// # Safety
    /// The view must be valid in combination with the given buffers.
    #[inline]
    pub unsafe fn get_view<B: AsRef<[u8]>>(
        &self,
        hash: u64,
        key: &View,
        buffers: &[B],
    ) -> Option<&V> {
        unsafe {
            if key.length <= View::MAX_INLINE_SIZE {
                self.get_inline_view(hash, key)
            } else {
                self.get_long_key(hash, key.get_external_slice_unchecked(buffers))
            }
        }
    }

    /// # Safety
    /// The view must be inlined.
    pub unsafe fn get_inline_view(&self, hash: u64, key: &View) -> Option<&V> {
        unsafe {
            debug_assert!(key.length <= View::MAX_INLINE_SIZE);
            // Both sides are inlined views carrying their key bytes directly,
            // so bitwise view equality serves as key equality; the stored
            // hash does not need to be compared.
            let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {
                let t = self.tuples.get_unchecked(*i as usize);
                *key == t.0.view
            })?;
            Some(&self.tuples.get_unchecked(*idx as usize).1)
        }
    }

    /// # Safety
    /// key.len() > View::MAX_INLINE_SIZE
    pub unsafe fn get_long_key(&self, hash: u64, key: &[u8]) -> Option<&V> {
        unsafe {
            debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);
            // Long keys live in self.buffers; compare hash and length first
            // as cheap rejects before the byte-wise slice comparison.
            let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {
                let t = self.tuples.get_unchecked(*i as usize);
                hash == t.0.hash
                    && key.len() == t.0.view.length as usize
                    && key == t.0.view.get_external_slice_unchecked(&self.buffers)
            })?;
            Some(&self.tuples.get_unchecked(*idx as usize).1)
        }
    }

    /// Gets the entry for `key` (pre-hashed as `hash`) for in-place
    /// inspection or insertion, dispatching on inline vs. long keys.
    #[inline]
    pub fn entry<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
        unsafe {
            if key.len() <= View::MAX_INLINE_SIZE as usize {
                // SAFETY: length checked above, so the key fits inline.
                self.entry_inline_view(hash, View::new_inline_unchecked(key))
            } else {
                self.entry_long_key(hash, key)
            }
        }
    }

    /// Entry lookup for a key given as a `View` into the caller's `buffers`.
    ///
    /// # Safety
    /// The view must be valid in combination with the given buffers.
    #[inline]
    pub unsafe fn entry_view<'k, B: AsRef<[u8]>>(
        &mut self,
        hash: u64,
        key: View,
        buffers: &'k [B],
    ) -> Entry<'_, 'k, V> {
        unsafe {
            if key.length <= View::MAX_INLINE_SIZE {
                self.entry_inline_view(hash, key)
            } else {
                self.entry_long_key(hash, key.get_external_slice_unchecked(buffers))
            }
        }
    }

    /// # Safety
    /// The view must be inlined.
    pub unsafe fn entry_inline_view<'k>(&mut self, hash: u64, key: View) -> Entry<'_, 'k, V> {
        debug_assert!(key.length <= View::MAX_INLINE_SIZE);
        let entry = self.table.entry(
            hash.wrapping_mul(self.seed),
            // Equality: bitwise view comparison, as in get_inline_view.
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                key == t.0.view
            },
            // Rehash closure, as in reserve().
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                t.0.hash.wrapping_mul(self.seed)
            },
        );

        match entry {
            TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
                entry: o,
                tuples: &mut self.tuples,
            }),
            TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
                // Inline key: the view itself already carries the bytes.
                view: key,
                external: None,
                hash,
                entry: v,
                tuples: &mut self.tuples,
                buffers: &mut self.buffers,
            }),
        }
    }

    /// # Safety
    /// key.len() > View::MAX_INLINE_SIZE
    pub unsafe fn entry_long_key<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
        debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);
        let entry = self.table.entry(
            hash.wrapping_mul(self.seed),
            // Equality: cheap hash/length rejects before comparing bytes,
            // as in get_long_key.
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                hash == t.0.hash
                    && key.len() == t.0.view.length as usize
                    && key == t.0.view.get_external_slice_unchecked(&self.buffers)
            },
            // Rehash closure, as in reserve().
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                t.0.hash.wrapping_mul(self.seed)
            },
        );

        match entry {
            TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
                entry: o,
                tuples: &mut self.tuples,
            }),
            TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
                // The real view is only built on insert(), after the key
                // bytes have been copied into self.buffers.
                view: View::default(),
                external: Some(key),
                hash,
                entry: v,
                tuples: &mut self.tuples,
                buffers: &mut self.buffers,
            }),
        }
    }

    /// Insert an empty entry which will never be mapped to. Returns the index of the entry.
    ///
    /// This is useful for entries which are handled externally.
    pub fn push_unmapped_empty_entry(&mut self, value: V) -> IdxSize {
        let ret = self.tuples.len() as IdxSize;
        // Pushed to tuples only, never inserted into the hash table, so no
        // lookup can ever return it.
        let key = Key {
            hash: 0,
            view: View::default(),
        };
        self.tuples.push((key, value));
        ret
    }

    /// Gets the hash, key and value at the given index by insertion order.
    #[inline(always)]
    pub fn get_index(&self, idx: IdxSize) -> Option<(u64, &[u8], &V)> {
        let t = self.tuples.get(idx as usize)?;
        Some((
            t.0.hash,
            // SAFETY: stored views are always valid w.r.t. self.buffers.
            unsafe { t.0.view.get_slice_unchecked(&self.buffers) },
            &t.1,
        ))
    }

    /// Gets the hash, key and value at the given index by insertion order.
    ///
    /// # Safety
    /// The index must be less than len().
    #[inline(always)]
    pub unsafe fn get_index_unchecked(&self, idx: IdxSize) -> (u64, &[u8], &V) {
        let t = unsafe { self.tuples.get_unchecked(idx as usize) };
        unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers), &t.1) }
    }

    /// Gets the hash, view and value at the given index by insertion order.
    ///
    /// # Safety
    /// The index must be less than len().
    #[inline(always)]
    pub unsafe fn get_index_view_unchecked(&self, idx: IdxSize) -> (u64, View, &V) {
        let t = unsafe { self.tuples.get_unchecked(idx as usize) };
        (t.0.hash, t.0.view, &t.1)
    }

    /// Iterates over the (hash, key) pairs in insertion order.
    pub fn iter_hash_keys(&self) -> impl Iterator<Item = (u64, &[u8])> {
        self.tuples
            .iter()
            .map(|t| unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers)) })
    }

    /// Iterates over the (hash, key_view) pairs in insertion order.
    pub fn iter_hash_views(&self) -> impl Iterator<Item = (u64, View)> {
        self.tuples.iter().map(|t| (t.0.hash, t.0.view))
    }

    /// Iterates over the values in insertion order.
    pub fn iter_values(&self) -> impl Iterator<Item = &V> {
        self.tuples.iter().map(|t| &t.1)
    }
}
275
/// Result of an `entry*` lookup: either an existing slot or a vacancy that
/// can be filled with `VacantEntry::insert`.
pub enum Entry<'a, 'k, V> {
    Occupied(OccupiedEntry<'a, V>),
    Vacant(VacantEntry<'a, 'k, V>),
}
280
/// A handle to an existing map entry; see [`Entry`].
pub struct OccupiedEntry<'a, V> {
    // Hash-table slot holding the index into `tuples`.
    entry: TOccupiedEntry<'a, IdxSize>,
    tuples: &'a mut Vec<(Key, V)>,
}
285
286impl<'a, V> OccupiedEntry<'a, V> {
287    #[inline]
288    pub fn index(&self) -> IdxSize {
289        *self.entry.get()
290    }
291
292    #[inline]
293    pub fn into_mut(self) -> &'a mut V {
294        let idx = self.index();
295        unsafe { &mut self.tuples.get_unchecked_mut(idx as usize).1 }
296    }
297}
298
/// A handle to a missing map entry; calling [`VacantEntry::insert`] fills it.
pub struct VacantEntry<'a, 'k, V> {
    hash: u64,
    view: View,                 // Empty when key is not inlined.
    external: Option<&'k [u8]>, // Only set when not inlined.
    entry: TVacantEntry<'a, IdxSize>,
    tuples: &'a mut Vec<(Key, V)>,
    buffers: &'a mut Vec<Vec<u8>>,
}
307
#[allow(clippy::needless_lifetimes)]
impl<'a, 'k, V> VacantEntry<'a, 'k, V> {
    /// The index this entry will occupy once inserted.
    #[inline]
    pub fn index(&self) -> IdxSize {
        self.tuples.len() as IdxSize
    }

    /// Inserts the key/value pair, copying a long key into the key buffers,
    /// and returns a mutable reference to the stored value.
    #[inline]
    pub fn insert(self, value: V) -> &'a mut V {
        unsafe {
            // Panics (rather than truncates) if the entry count overflows IdxSize.
            let tuple_idx: IdxSize = self.tuples.len().try_into().unwrap();
            let view = if let Some(key) = self.external {
                // Long key: append its bytes to the active (last) buffer,
                // starting a fresh buffer when the key would not fit within
                // the remaining capacity. Because appends never exceed
                // capacity, a buffer never reallocates, keeping previously
                // created views valid.
                if self
                    .buffers
                    .last()
                    .is_none_or(|buf| buf.len() + key.len() > buf.capacity())
                {
                    // Each new buffer doubles the ideal capacity:
                    // BASE_KEY_BUFFER_CAPACITY << number_of_existing_buffers.
                    let ideal_next_cap = BASE_KEY_BUFFER_CAPACITY
                        .checked_shl(self.buffers.len() as u32)
                        .unwrap();
                    // Still make room for an unusually large key.
                    let next_capacity = std::cmp::max(ideal_next_cap, key.len());
                    self.buffers.push(Vec::with_capacity(next_capacity));
                }
                let buffer_idx = (self.buffers.len() - 1) as u32;
                // SAFETY: the branch above guarantees at least one buffer.
                let active_buf = self.buffers.last_mut().unwrap_unchecked();
                let offset = active_buf.len() as u32;
                active_buf.extend_from_slice(key);
                View::new_from_bytes(key, buffer_idx, offset)
            } else {
                // Inline key: the prepared view already carries the bytes.
                self.view
            };
            let tuple_key = Key {
                hash: self.hash,
                view,
            };
            self.tuples.push((tuple_key, value));
            self.entry.insert(tuple_idx);
            // SAFETY: we just pushed, so `tuples` is non-empty.
            &mut self.tuples.last_mut().unwrap_unchecked().1
        }
    }
}
348}