Skip to main content

polars_arrow/bitmap/
builder.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2use polars_buffer::SharedStorage;
3use polars_utils::IdxSize;
4use polars_utils::slice::load_padded_le_u64;
5
6use super::bitmask::BitMask;
7use crate::bitmap::{Bitmap, MutableBitmap};
8use crate::trusted_len::TrustedLen;
9
10/// Used to build bitmaps bool-by-bool in sequential order.
11#[derive(Default, Clone)]
12pub struct BitmapBuilder {
13    buf: u64,                 // A buffer containing the last self.bit_len % 64 bits.
14    bit_len: usize,           // Length in bits.
15    bit_cap: usize,           // Capacity in bits (always multiple of 64).
16    set_bits_in_bytes: usize, // The number of bits set in self.bytes, not including self.buf.
17    bytes: Vec<u8>,
18}
19
20impl BitmapBuilder {
21    pub fn new() -> Self {
22        Self::default()
23    }
24
25    #[inline(always)]
26    pub fn len(&self) -> usize {
27        self.bit_len
28    }
29
30    #[inline(always)]
31    pub fn is_empty(&self) -> bool {
32        self.bit_len == 0
33    }
34
35    #[inline(always)]
36    pub fn capacity(&self) -> usize {
37        self.bit_cap
38    }
39
40    #[inline(always)]
41    pub fn set_bits(&self) -> usize {
42        self.set_bits_in_bytes + self.buf.count_ones() as usize
43    }
44
45    #[inline(always)]
46    pub fn unset_bits(&self) -> usize {
47        self.bit_len - self.set_bits()
48    }
49
50    pub fn with_capacity(bits: usize) -> Self {
51        let bytes = Vec::with_capacity(bits.div_ceil(64) * 8);
52        let words_available = bytes.capacity() / 8;
53        Self {
54            buf: 0,
55            bit_len: 0,
56            bit_cap: words_available * 64,
57            set_bits_in_bytes: 0,
58            bytes,
59        }
60    }
61
62    #[inline(always)]
63    pub fn reserve(&mut self, additional: usize) {
64        if self.bit_len + additional > self.bit_cap {
65            self.reserve_slow(additional)
66        }
67    }
68
69    #[cold]
70    #[inline(never)]
71    fn reserve_slow(&mut self, additional: usize) {
72        let bytes_needed = (self.bit_len + additional).div_ceil(64) * 8;
73        self.bytes.reserve(bytes_needed - self.bytes.len());
74        let words_available = self.bytes.capacity() / 8;
75        self.bit_cap = words_available * 64;
76    }
77
78    pub fn clear(&mut self) {
79        self.buf = 0;
80        self.bit_len = 0;
81        self.set_bits_in_bytes = 0;
82        self.bytes.clear();
83    }
84
85    #[inline(always)]
86    pub fn push(&mut self, x: bool) {
87        self.reserve(1);
88        unsafe { self.push_unchecked(x) }
89    }
90
91    /// Does not update len/set_bits, simply writes to the output buffer.
92    /// # Safety
93    /// self.bytes.len() + 8 <= self.bytes.capacity() must hold.
94    #[inline(always)]
95    unsafe fn flush_word_unchecked(&mut self, w: u64) {
96        let cur_len = self.bytes.len();
97        let p = self.bytes.as_mut_ptr().add(cur_len).cast::<u64>();
98        p.write_unaligned(w.to_le());
99        self.bytes.set_len(cur_len + 8);
100    }
101
102    /// # Safety
103    /// self.len() < self.capacity() must hold.
104    #[inline(always)]
105    pub unsafe fn push_unchecked(&mut self, x: bool) {
106        debug_assert!(self.bit_len < self.bit_cap);
107        self.buf |= (x as u64) << (self.bit_len % 64);
108        self.bit_len += 1;
109        if self.bit_len.is_multiple_of(64) {
110            self.flush_word_unchecked(self.buf);
111            self.set_bits_in_bytes += self.buf.count_ones() as usize;
112            self.buf = 0;
113        }
114    }
115
116    #[inline(always)]
117    pub fn extend_constant(&mut self, length: usize, value: bool) {
118        // Fast path if the extension still fits in buf with room left to spare.
119        let bits_in_buf = self.bit_len % 64;
120        if bits_in_buf + length < 64 {
121            let bit_block = ((value as u64) << length) - (value as u64);
122            self.buf |= bit_block << bits_in_buf;
123            self.bit_len += length;
124        } else {
125            self.extend_constant_slow(length, value);
126        }
127    }
128
129    #[cold]
130    fn extend_constant_slow(&mut self, length: usize, value: bool) {
131        unsafe {
132            let value_spread = if value { u64::MAX } else { 0 }; // Branchless neg.
133
134            // Extend and flush current buf.
135            self.reserve(length);
136            let bits_in_buf = self.bit_len % 64;
137            let ext_buf = self.buf | (value_spread << bits_in_buf);
138            self.flush_word_unchecked(ext_buf);
139            self.set_bits_in_bytes += ext_buf.count_ones() as usize;
140
141            // Write complete words.
142            let remaining_bits = length - (64 - bits_in_buf);
143            let remaining_words = remaining_bits / 64;
144            for _ in 0..remaining_words {
145                self.flush_word_unchecked(value_spread);
146            }
147            self.set_bits_in_bytes += (remaining_words * 64) & value_spread as usize;
148
149            // Put remainder in buf and update length.
150            self.buf = ((value as u64) << (remaining_bits % 64)) - (value as u64);
151            self.bit_len += length;
152        }
153    }
154
155    /// Pushes the first length bits from the given word, assuming the rest of
156    /// the bits are zero.
157    /// # Safety
158    /// self.len + length <= self.cap and length <= 64 must hold.
159    pub unsafe fn push_word_with_len_unchecked(&mut self, word: u64, length: usize) {
160        debug_assert!(self.bit_len + length <= self.bit_cap);
161        debug_assert!(length <= 64);
162        debug_assert!(length == 64 || (word >> length) == 0);
163        let bits_in_buf = self.bit_len % 64;
164        self.buf |= word << bits_in_buf;
165        if bits_in_buf + length >= 64 {
166            self.flush_word_unchecked(self.buf);
167            self.set_bits_in_bytes += self.buf.count_ones() as usize;
168            self.buf = if bits_in_buf > 0 {
169                word >> (64 - bits_in_buf)
170            } else {
171                0
172            };
173        }
174        self.bit_len += length;
175    }
176
177    /// # Safety
178    /// self.len() + length <= self.capacity() must hold, as well as
179    /// offset + length <= 8 * slice.len().
180    unsafe fn extend_from_slice_unchecked(
181        &mut self,
182        mut slice: &[u8],
183        mut offset: usize,
184        mut length: usize,
185    ) {
186        if length == 0 {
187            return;
188        }
189
190        // Deal with slice offset so it's aligned to bytes.
191        let slice_bit_offset = offset % 8;
192        if slice_bit_offset > 0 {
193            let bits_in_first_byte = (8 - slice_bit_offset).min(length);
194            let first_byte = *slice.get_unchecked(offset / 8) >> slice_bit_offset;
195            self.push_word_with_len_unchecked(
196                first_byte as u64 & ((1 << bits_in_first_byte) - 1),
197                bits_in_first_byte,
198            );
199            length -= bits_in_first_byte;
200            offset += bits_in_first_byte;
201        }
202        slice = slice.get_unchecked(offset / 8..);
203
204        // Write word-by-word.
205        let bits_in_buf = self.bit_len % 64;
206        if bits_in_buf > 0 {
207            while length >= 64 {
208                let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
209                self.buf |= word << bits_in_buf;
210                self.flush_word_unchecked(self.buf);
211                self.set_bits_in_bytes += self.buf.count_ones() as usize;
212                self.buf = word >> (64 - bits_in_buf);
213                self.bit_len += 64;
214                length -= 64;
215                slice = slice.get_unchecked(8..);
216            }
217        } else {
218            while length >= 64 {
219                let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
220                self.flush_word_unchecked(word);
221                self.set_bits_in_bytes += word.count_ones() as usize;
222                self.bit_len += 64;
223                length -= 64;
224                slice = slice.get_unchecked(8..);
225            }
226        }
227
228        // Just the last word left.
229        if length > 0 {
230            let word = load_padded_le_u64(slice);
231            self.push_word_with_len_unchecked(word & ((1 << length) - 1), length);
232        }
233    }
234
235    /// # Safety
236    /// self.len() + length*repeats <= self.capacity() must hold, as well as
237    /// offset + length <= 8 * slice.len().
238    unsafe fn extend_each_repeated_from_slice_unchecked(
239        &mut self,
240        slice: &[u8],
241        offset: usize,
242        length: usize,
243        repeats: usize,
244    ) {
245        if repeats == 0 {
246            return;
247        }
248        if repeats == 1 {
249            return self.extend_from_slice_unchecked(slice, offset, length);
250        }
251        for bit_idx in offset..length {
252            let bit = (*slice.get_unchecked(bit_idx / 8) >> (bit_idx % 8)) & 1 != 0;
253            self.extend_constant(repeats, bit);
254        }
255    }
256
257    pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) {
258        assert!(8 * slice.len() >= offset + length);
259        self.reserve(length);
260        unsafe {
261            self.extend_from_slice_unchecked(slice, offset, length);
262        }
263    }
264
265    pub fn extend_each_repeated_from_slice(
266        &mut self,
267        slice: &[u8],
268        offset: usize,
269        length: usize,
270        repeats: usize,
271    ) {
272        assert!(8 * slice.len() >= offset + length);
273        self.reserve(length * repeats);
274        unsafe {
275            self.extend_each_repeated_from_slice_unchecked(slice, offset, length, repeats);
276        }
277    }
278
279    pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) {
280        // TODO: we can perhaps use the bitmaps bitcount here instead of
281        // recomputing it if it has a known bitcount.
282        let (slice, offset, length) = bitmap.as_slice();
283        self.extend_from_slice(slice, offset, length);
284    }
285
286    pub fn extend_from_bitmask(&mut self, bitmap: BitMask<'_>) {
287        let (slice, offset, length) = bitmap.inner();
288        self.extend_from_slice(slice, offset, length);
289    }
290
291    /// Extends this BitmapBuilder with a subslice of a bitmap.
292    pub fn subslice_extend_from_bitmap(&mut self, bitmap: &Bitmap, start: usize, length: usize) {
293        let (slice, bm_offset, bm_length) = bitmap.as_slice();
294        assert!(start + length <= bm_length);
295        self.extend_from_slice(slice, bm_offset + start, length);
296    }
297
298    /// Extends this BitmapBuilder with a subslice of a bitmap, repeating each bit `repeats` times.
299    pub fn subslice_extend_each_repeated_from_bitmap(
300        &mut self,
301        bitmap: &Bitmap,
302        start: usize,
303        length: usize,
304        repeats: usize,
305    ) {
306        let (slice, bm_offset, bm_length) = bitmap.as_slice();
307        assert!(start + length <= bm_length);
308        self.extend_each_repeated_from_slice(slice, bm_offset + start, length, repeats);
309    }
310
311    pub fn subslice_extend_from_opt_validity(
312        &mut self,
313        bitmap: Option<&Bitmap>,
314        start: usize,
315        length: usize,
316    ) {
317        match bitmap {
318            Some(bm) => self.subslice_extend_from_bitmap(bm, start, length),
319            None => self.extend_constant(length, true),
320        }
321    }
322
323    pub fn subslice_extend_each_repeated_from_opt_validity(
324        &mut self,
325        bitmap: Option<&Bitmap>,
326        start: usize,
327        length: usize,
328        repeats: usize,
329    ) {
330        match bitmap {
331            Some(bm) => self.subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats),
332            None => self.extend_constant(length * repeats, true),
333        }
334    }
335
336    /// # Safety
337    /// The indices must be in-bounds.
338    pub unsafe fn gather_extend_from_slice(
339        &mut self,
340        slice: &[u8],
341        offset: usize,
342        length: usize,
343        idxs: &[IdxSize],
344    ) {
345        assert!(8 * slice.len() >= offset + length);
346
347        self.reserve(idxs.len());
348        unsafe {
349            for idx in idxs {
350                debug_assert!((*idx as usize) < length);
351                let idx_in_slice = offset + *idx as usize;
352                let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;
353                self.push_unchecked(bit != 0);
354            }
355        }
356    }
357
358    pub fn opt_gather_extend_from_slice(
359        &mut self,
360        slice: &[u8],
361        offset: usize,
362        length: usize,
363        idxs: &[IdxSize],
364    ) {
365        assert!(8 * slice.len() >= offset + length);
366
367        self.reserve(idxs.len());
368        unsafe {
369            for idx in idxs {
370                if (*idx as usize) < length {
371                    let idx_in_slice = offset + *idx as usize;
372                    let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;
373                    self.push_unchecked(bit != 0);
374                } else {
375                    self.push_unchecked(false);
376                }
377            }
378        }
379    }
380
381    /// # Safety
382    /// The indices must be in-bounds.
383    pub unsafe fn gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {
384        let (slice, offset, length) = bitmap.as_slice();
385        self.gather_extend_from_slice(slice, offset, length, idxs);
386    }
387
388    pub fn opt_gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {
389        let (slice, offset, length) = bitmap.as_slice();
390        self.opt_gather_extend_from_slice(slice, offset, length, idxs);
391    }
392
393    /// # Safety
394    /// The indices must be in-bounds.
395    pub unsafe fn gather_extend_from_opt_validity(
396        &mut self,
397        bitmap: Option<&Bitmap>,
398        idxs: &[IdxSize],
399        length: usize,
400    ) {
401        if let Some(bm) = bitmap {
402            let (slice, offset, sl_length) = bm.as_slice();
403            debug_assert_eq!(sl_length, length);
404            self.gather_extend_from_slice(slice, offset, length, idxs);
405        } else {
406            self.extend_constant(length, true);
407        }
408    }
409
410    pub fn opt_gather_extend_from_opt_validity(
411        &mut self,
412        bitmap: Option<&Bitmap>,
413        idxs: &[IdxSize],
414        length: usize,
415    ) {
416        if let Some(bm) = bitmap {
417            let (slice, offset, sl_length) = bm.as_slice();
418            debug_assert_eq!(sl_length, length);
419            self.opt_gather_extend_from_slice(slice, offset, sl_length, idxs);
420        } else {
421            unsafe {
422                self.reserve(idxs.len());
423                for idx in idxs {
424                    self.push_unchecked((*idx as usize) < length);
425                }
426            }
427        }
428    }
429
430    /// # Safety
431    /// May only be called once at the end.
432    unsafe fn finish(&mut self) {
433        if !self.bit_len.is_multiple_of(64) {
434            self.bytes.extend_from_slice(&self.buf.to_le_bytes());
435            self.set_bits_in_bytes += self.buf.count_ones() as usize;
436            self.buf = 0;
437        }
438    }
439
440    /// Converts this BitmapBuilder into a mutable bitmap.
441    pub fn into_mut(mut self) -> MutableBitmap {
442        unsafe {
443            self.finish();
444            MutableBitmap::from_vec(self.bytes, self.bit_len)
445        }
446    }
447
448    /// The same as into_mut, but returns None if the bitmap is all-ones.
449    pub fn into_opt_mut_validity(mut self) -> Option<MutableBitmap> {
450        unsafe {
451            self.finish();
452            if self.set_bits_in_bytes == self.bit_len {
453                return None;
454            }
455            Some(MutableBitmap::from_vec(self.bytes, self.bit_len))
456        }
457    }
458
459    /// Freezes this BitmapBuilder into an immutable Bitmap.
460    pub fn freeze(mut self) -> Bitmap {
461        unsafe {
462            self.finish();
463            let storage = SharedStorage::from_vec(self.bytes);
464            Bitmap::from_inner_unchecked(
465                storage,
466                0,
467                self.bit_len,
468                Some(self.bit_len - self.set_bits_in_bytes),
469            )
470        }
471    }
472
473    /// The same as freeze, but returns None if the bitmap is all-ones.
474    pub fn into_opt_validity(mut self) -> Option<Bitmap> {
475        unsafe {
476            self.finish();
477            if self.set_bits_in_bytes == self.bit_len {
478                return None;
479            }
480            let storage = SharedStorage::from_vec(self.bytes);
481            let bitmap = Bitmap::from_inner_unchecked(
482                storage,
483                0,
484                self.bit_len,
485                Some(self.bit_len - self.set_bits_in_bytes),
486            );
487            Some(bitmap)
488        }
489    }
490
491    pub fn extend_trusted_len_iter<I>(&mut self, iterator: I)
492    where
493        I: Iterator<Item = bool> + TrustedLen,
494    {
495        self.reserve(iterator.size_hint().1.unwrap());
496        for b in iterator {
497            // SAFETY: we reserved and the iterator's length is trusted.
498            unsafe {
499                self.push_unchecked(b);
500            }
501        }
502    }
503
504    #[inline]
505    pub fn from_trusted_len_iter<I>(iterator: I) -> Self
506    where
507        I: Iterator<Item = bool> + TrustedLen,
508    {
509        let mut builder = Self::new();
510        builder.extend_trusted_len_iter(iterator);
511        builder
512    }
513}
514
515/// A wrapper for BitmapBuilder that does not allocate until the first false is
516/// pushed. Less efficient if you know there are false values because it must
517/// check if it has allocated for each push.
518pub enum OptBitmapBuilder {
519    AllTrue { bit_len: usize, bit_cap: usize },
520    MayHaveFalse(BitmapBuilder),
521}
522
523impl Default for OptBitmapBuilder {
524    fn default() -> Self {
525        Self::AllTrue {
526            bit_len: 0,
527            bit_cap: 0,
528        }
529    }
530}
531
532impl OptBitmapBuilder {
533    pub fn reserve(&mut self, additional: usize) {
534        match self {
535            Self::AllTrue { bit_len, bit_cap } => {
536                *bit_cap = usize::max(*bit_cap, *bit_len + additional);
537            },
538            Self::MayHaveFalse(inner) => inner.reserve(additional),
539        }
540    }
541
542    pub fn extend_constant(&mut self, length: usize, value: bool) {
543        match self {
544            Self::AllTrue { bit_len, bit_cap } => {
545                if value {
546                    *bit_cap = usize::max(*bit_cap, *bit_len + length);
547                    *bit_len += length;
548                } else {
549                    self.get_builder().extend_constant(length, value);
550                }
551            },
552            Self::MayHaveFalse(inner) => inner.extend_constant(length, value),
553        }
554    }
555
556    pub fn into_opt_validity(self) -> Option<Bitmap> {
557        match self {
558            Self::AllTrue { .. } => None,
559            Self::MayHaveFalse(inner) => inner.into_opt_validity(),
560        }
561    }
562
563    pub fn subslice_extend_from_opt_validity(
564        &mut self,
565        bitmap: Option<&Bitmap>,
566        start: usize,
567        length: usize,
568    ) {
569        match bitmap {
570            Some(bm) => {
571                self.get_builder()
572                    .subslice_extend_from_bitmap(bm, start, length);
573            },
574            None => {
575                self.extend_constant(length, true);
576            },
577        }
578    }
579
580    pub fn subslice_extend_each_repeated_from_opt_validity(
581        &mut self,
582        bitmap: Option<&Bitmap>,
583        start: usize,
584        length: usize,
585        repeats: usize,
586    ) {
587        match bitmap {
588            Some(bm) => {
589                self.get_builder()
590                    .subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats);
591            },
592            None => {
593                self.extend_constant(length * repeats, true);
594            },
595        }
596    }
597
598    /// # Safety
599    /// The indices must be in-bounds.
600    pub unsafe fn gather_extend_from_opt_validity(
601        &mut self,
602        bitmap: Option<&Bitmap>,
603        idxs: &[IdxSize],
604    ) {
605        match bitmap {
606            Some(bm) => {
607                self.get_builder().gather_extend_from_bitmap(bm, idxs);
608            },
609            None => {
610                self.extend_constant(idxs.len(), true);
611            },
612        }
613    }
614
615    pub fn opt_gather_extend_from_opt_validity(
616        &mut self,
617        bitmap: Option<&Bitmap>,
618        idxs: &[IdxSize],
619        length: usize,
620    ) {
621        match bitmap {
622            Some(bm) => {
623                self.get_builder().opt_gather_extend_from_bitmap(bm, idxs);
624            },
625            None => {
626                if let Some(first_oob) = idxs.iter().position(|idx| *idx as usize >= length) {
627                    let builder = self.get_builder();
628                    builder.extend_constant(first_oob, true);
629                    for idx in idxs.iter().skip(first_oob) {
630                        builder.push((*idx as usize) < length);
631                    }
632                } else {
633                    self.extend_constant(idxs.len(), true);
634                }
635            },
636        }
637    }
638
639    fn get_builder(&mut self) -> &mut BitmapBuilder {
640        match self {
641            Self::AllTrue { bit_len, bit_cap } => {
642                let mut builder = BitmapBuilder::with_capacity(*bit_cap);
643                builder.extend_constant(*bit_len, true);
644                *self = Self::MayHaveFalse(builder);
645                let Self::MayHaveFalse(inner) = self else {
646                    unreachable!()
647                };
648                inner
649            },
650            Self::MayHaveFalse(inner) => inner,
651        }
652    }
653}