Skip to main content

polars_arrow/bitmap/
builder.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2use polars_buffer::SharedStorage;
3use polars_utils::IdxSize;
4use polars_utils::slice::load_padded_le_u64;
5
6use super::bitmask::BitMask;
7use crate::bitmap::{Bitmap, MutableBitmap};
8use crate::trusted_len::TrustedLen;
9
10/// Used to build bitmaps bool-by-bool in sequential order.
11#[derive(Default, Clone)]
12pub struct BitmapBuilder {
13    buf: u64,                 // A buffer containing the last self.bit_len % 64 bits.
14    bit_len: usize,           // Length in bits.
15    bit_cap: usize,           // Capacity in bits (always multiple of 64).
16    set_bits_in_bytes: usize, // The number of bits set in self.bytes, not including self.buf.
17    bytes: Vec<u8>,
18}
19
20impl BitmapBuilder {
21    pub fn new() -> Self {
22        Self::default()
23    }
24
25    #[inline(always)]
26    pub fn len(&self) -> usize {
27        self.bit_len
28    }
29
30    #[inline(always)]
31    pub fn is_empty(&self) -> bool {
32        self.bit_len == 0
33    }
34
35    #[inline(always)]
36    pub fn capacity(&self) -> usize {
37        self.bit_cap
38    }
39
40    #[inline(always)]
41    pub fn set_bits(&self) -> usize {
42        self.set_bits_in_bytes + self.buf.count_ones() as usize
43    }
44
45    #[inline(always)]
46    pub fn unset_bits(&self) -> usize {
47        self.bit_len - self.set_bits()
48    }
49
50    pub fn with_capacity(bits: usize) -> Self {
51        let bytes = Vec::with_capacity(bits.div_ceil(64) * 8);
52        let words_available = bytes.capacity() / 8;
53        Self {
54            buf: 0,
55            bit_len: 0,
56            bit_cap: words_available * 64,
57            set_bits_in_bytes: 0,
58            bytes,
59        }
60    }
61
62    #[inline(always)]
63    pub fn reserve(&mut self, additional: usize) {
64        if self.bit_len + additional > self.bit_cap {
65            self.reserve_slow(additional)
66        }
67    }
68
69    #[cold]
70    #[inline(never)]
71    fn reserve_slow(&mut self, additional: usize) {
72        let bytes_needed = (self.bit_len + additional).div_ceil(64) * 8;
73        self.bytes.reserve(bytes_needed - self.bytes.len());
74        let words_available = self.bytes.capacity() / 8;
75        self.bit_cap = words_available * 64;
76    }
77
78    pub fn clear(&mut self) {
79        self.buf = 0;
80        self.bit_len = 0;
81        self.set_bits_in_bytes = 0;
82        self.bytes.clear();
83    }
84
85    #[inline(always)]
86    pub fn push(&mut self, x: bool) {
87        self.reserve(1);
88        unsafe { self.push_unchecked(x) }
89    }
90
91    /// Does not update len/set_bits, simply writes to the output buffer.
92    /// # Safety
93    /// self.bytes.len() + 8 <= self.bytes.capacity() must hold.
94    #[inline(always)]
95    unsafe fn flush_word_unchecked(&mut self, w: u64) {
96        let cur_len = self.bytes.len();
97        let p = self.bytes.as_mut_ptr().add(cur_len).cast::<u64>();
98        p.write_unaligned(w.to_le());
99        self.bytes.set_len(cur_len + 8);
100    }
101
102    /// # Safety
103    /// self.len() < self.capacity() must hold.
104    #[inline(always)]
105    pub unsafe fn push_unchecked(&mut self, x: bool) {
106        debug_assert!(self.bit_len < self.bit_cap);
107        self.buf |= (x as u64) << (self.bit_len % 64);
108        self.bit_len += 1;
109        if self.bit_len.is_multiple_of(64) {
110            self.flush_word_unchecked(self.buf);
111            self.set_bits_in_bytes += self.buf.count_ones() as usize;
112            self.buf = 0;
113        }
114    }
115
116    #[inline(always)]
117    pub fn extend_constant(&mut self, length: usize, value: bool) {
118        // Fast path if the extension still fits in buf with room left to spare.
119        let bits_in_buf = self.bit_len % 64;
120        if bits_in_buf + length < 64 {
121            let bit_block = ((value as u64) << length) - (value as u64);
122            self.buf |= bit_block << bits_in_buf;
123            self.bit_len += length;
124        } else {
125            self.extend_constant_slow(length, value);
126        }
127    }
128
129    #[cold]
130    fn extend_constant_slow(&mut self, length: usize, value: bool) {
131        unsafe {
132            let value_spread = if value { u64::MAX } else { 0 }; // Branchless neg.
133
134            // Extend and flush current buf.
135            self.reserve(length);
136            let bits_in_buf = self.bit_len % 64;
137            let ext_buf = self.buf | (value_spread << bits_in_buf);
138            self.flush_word_unchecked(ext_buf);
139            self.set_bits_in_bytes += ext_buf.count_ones() as usize;
140
141            // Write complete words.
142            let remaining_bits = length - (64 - bits_in_buf);
143            let remaining_words = remaining_bits / 64;
144            for _ in 0..remaining_words {
145                self.flush_word_unchecked(value_spread);
146            }
147            self.set_bits_in_bytes += (remaining_words * 64) & value_spread as usize;
148
149            // Put remainder in buf and update length.
150            self.buf = ((value as u64) << (remaining_bits % 64)) - (value as u64);
151            self.bit_len += length;
152        }
153    }
154
155    /// Pushes the first length bits from the given word, assuming the rest of
156    /// the bits are zero.
157    /// # Safety
158    /// self.len + length <= self.cap and length <= 64 must hold.
159    pub unsafe fn push_word_with_len_unchecked(&mut self, word: u64, length: usize) {
160        debug_assert!(self.bit_len + length <= self.bit_cap);
161        debug_assert!(length <= 64);
162        debug_assert!(length == 64 || (word >> length) == 0);
163        let bits_in_buf = self.bit_len % 64;
164        self.buf |= word << bits_in_buf;
165        if bits_in_buf + length >= 64 {
166            self.flush_word_unchecked(self.buf);
167            self.set_bits_in_bytes += self.buf.count_ones() as usize;
168            self.buf = if bits_in_buf > 0 {
169                word >> (64 - bits_in_buf)
170            } else {
171                0
172            };
173        }
174        self.bit_len += length;
175    }
176
177    /// # Safety
178    /// self.len() + length <= self.capacity() must hold, as well as
179    /// offset + length <= 8 * slice.len().
180    unsafe fn extend_from_slice_unchecked(
181        &mut self,
182        mut slice: &[u8],
183        mut offset: usize,
184        mut length: usize,
185    ) {
186        if length == 0 {
187            return;
188        }
189
190        // Deal with slice offset so it's aligned to bytes.
191        let slice_bit_offset = offset % 8;
192        if slice_bit_offset > 0 {
193            let bits_in_first_byte = (8 - slice_bit_offset).min(length);
194            let first_byte = *slice.get_unchecked(offset / 8) >> slice_bit_offset;
195            self.push_word_with_len_unchecked(
196                first_byte as u64 & ((1 << bits_in_first_byte) - 1),
197                bits_in_first_byte,
198            );
199            length -= bits_in_first_byte;
200            offset += bits_in_first_byte;
201        }
202        slice = slice.get_unchecked(offset / 8..);
203
204        // Write word-by-word.
205        let bits_in_buf = self.bit_len % 64;
206        if bits_in_buf > 0 {
207            while length >= 64 {
208                let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
209                self.buf |= word << bits_in_buf;
210                self.flush_word_unchecked(self.buf);
211                self.set_bits_in_bytes += self.buf.count_ones() as usize;
212                self.buf = word >> (64 - bits_in_buf);
213                self.bit_len += 64;
214                length -= 64;
215                slice = slice.get_unchecked(8..);
216            }
217        } else {
218            while length >= 64 {
219                let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
220                self.flush_word_unchecked(word);
221                self.set_bits_in_bytes += word.count_ones() as usize;
222                self.bit_len += 64;
223                length -= 64;
224                slice = slice.get_unchecked(8..);
225            }
226        }
227
228        // Just the last word left.
229        if length > 0 {
230            let word = load_padded_le_u64(slice);
231            self.push_word_with_len_unchecked(word & ((1 << length) - 1), length);
232        }
233    }
234
235    /// # Safety
236    /// self.len() + length*repeats <= self.capacity() must hold, as well as
237    /// offset + length <= 8 * slice.len().
238    unsafe fn extend_each_repeated_from_slice_unchecked(
239        &mut self,
240        slice: &[u8],
241        offset: usize,
242        length: usize,
243        repeats: usize,
244    ) {
245        debug_assert!(8 * slice.len() >= offset + length);
246        if repeats == 0 {
247            return;
248        }
249        if repeats == 1 {
250            return self.extend_from_slice_unchecked(slice, offset, length);
251        }
252        for bit_idx in offset..(offset + length) {
253            let bit = (*slice.get_unchecked(bit_idx / 8) >> (bit_idx % 8)) & 1 != 0;
254            self.extend_constant(repeats, bit);
255        }
256    }
257
258    pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) {
259        assert!(8 * slice.len() >= offset + length);
260        self.reserve(length);
261        unsafe {
262            self.extend_from_slice_unchecked(slice, offset, length);
263        }
264    }
265
266    pub fn extend_each_repeated_from_slice(
267        &mut self,
268        slice: &[u8],
269        offset: usize,
270        length: usize,
271        repeats: usize,
272    ) {
273        assert!(8 * slice.len() >= offset + length);
274        self.reserve(length * repeats);
275        unsafe {
276            self.extend_each_repeated_from_slice_unchecked(slice, offset, length, repeats);
277        }
278    }
279
280    pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) {
281        // TODO: we can perhaps use the bitmaps bitcount here instead of
282        // recomputing it if it has a known bitcount.
283        let (slice, offset, length) = bitmap.as_slice();
284        self.extend_from_slice(slice, offset, length);
285    }
286
287    pub fn extend_from_bitmask(&mut self, bitmap: BitMask<'_>) {
288        let (slice, offset, length) = bitmap.inner();
289        self.extend_from_slice(slice, offset, length);
290    }
291
292    /// Extends this BitmapBuilder with a subslice of a bitmap.
293    pub fn subslice_extend_from_bitmap(&mut self, bitmap: &Bitmap, start: usize, length: usize) {
294        let (slice, bm_offset, bm_length) = bitmap.as_slice();
295        assert!(start + length <= bm_length);
296        self.extend_from_slice(slice, bm_offset + start, length);
297    }
298
299    /// Extends this BitmapBuilder with a subslice of a bitmap, repeating each bit `repeats` times.
300    pub fn subslice_extend_each_repeated_from_bitmap(
301        &mut self,
302        bitmap: &Bitmap,
303        start: usize,
304        length: usize,
305        repeats: usize,
306    ) {
307        let (slice, bm_offset, bm_length) = bitmap.as_slice();
308        assert!(start + length <= bm_length);
309        self.extend_each_repeated_from_slice(slice, bm_offset + start, length, repeats);
310    }
311
312    pub fn subslice_extend_from_opt_validity(
313        &mut self,
314        bitmap: Option<&Bitmap>,
315        start: usize,
316        length: usize,
317    ) {
318        match bitmap {
319            Some(bm) => self.subslice_extend_from_bitmap(bm, start, length),
320            None => self.extend_constant(length, true),
321        }
322    }
323
324    pub fn subslice_extend_each_repeated_from_opt_validity(
325        &mut self,
326        bitmap: Option<&Bitmap>,
327        start: usize,
328        length: usize,
329        repeats: usize,
330    ) {
331        match bitmap {
332            Some(bm) => self.subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats),
333            None => self.extend_constant(length * repeats, true),
334        }
335    }
336
337    /// # Safety
338    /// The indices must be in-bounds.
339    pub unsafe fn gather_extend_from_slice(
340        &mut self,
341        slice: &[u8],
342        offset: usize,
343        length: usize,
344        idxs: &[IdxSize],
345    ) {
346        assert!(8 * slice.len() >= offset + length);
347
348        self.reserve(idxs.len());
349        unsafe {
350            for idx in idxs {
351                debug_assert!((*idx as usize) < length);
352                let idx_in_slice = offset + *idx as usize;
353                let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;
354                self.push_unchecked(bit != 0);
355            }
356        }
357    }
358
359    pub fn opt_gather_extend_from_slice(
360        &mut self,
361        slice: &[u8],
362        offset: usize,
363        length: usize,
364        idxs: &[IdxSize],
365    ) {
366        assert!(8 * slice.len() >= offset + length);
367
368        self.reserve(idxs.len());
369        unsafe {
370            for idx in idxs {
371                if (*idx as usize) < length {
372                    let idx_in_slice = offset + *idx as usize;
373                    let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;
374                    self.push_unchecked(bit != 0);
375                } else {
376                    self.push_unchecked(false);
377                }
378            }
379        }
380    }
381
382    /// # Safety
383    /// The indices must be in-bounds.
384    pub unsafe fn gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {
385        let (slice, offset, length) = bitmap.as_slice();
386        self.gather_extend_from_slice(slice, offset, length, idxs);
387    }
388
389    pub fn opt_gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {
390        let (slice, offset, length) = bitmap.as_slice();
391        self.opt_gather_extend_from_slice(slice, offset, length, idxs);
392    }
393
394    /// # Safety
395    /// The indices must be in-bounds.
396    pub unsafe fn gather_extend_from_opt_validity(
397        &mut self,
398        bitmap: Option<&Bitmap>,
399        idxs: &[IdxSize],
400        length: usize,
401    ) {
402        if let Some(bm) = bitmap {
403            let (slice, offset, sl_length) = bm.as_slice();
404            debug_assert_eq!(sl_length, length);
405            self.gather_extend_from_slice(slice, offset, length, idxs);
406        } else {
407            self.extend_constant(length, true);
408        }
409    }
410
411    pub fn opt_gather_extend_from_opt_validity(
412        &mut self,
413        bitmap: Option<&Bitmap>,
414        idxs: &[IdxSize],
415        length: usize,
416    ) {
417        if let Some(bm) = bitmap {
418            let (slice, offset, sl_length) = bm.as_slice();
419            debug_assert_eq!(sl_length, length);
420            self.opt_gather_extend_from_slice(slice, offset, sl_length, idxs);
421        } else {
422            unsafe {
423                self.reserve(idxs.len());
424                for idx in idxs {
425                    self.push_unchecked((*idx as usize) < length);
426                }
427            }
428        }
429    }
430
431    /// # Safety
432    /// May only be called once at the end.
433    unsafe fn finish(&mut self) {
434        if !self.bit_len.is_multiple_of(64) {
435            self.bytes.extend_from_slice(&self.buf.to_le_bytes());
436            self.set_bits_in_bytes += self.buf.count_ones() as usize;
437            self.buf = 0;
438        }
439    }
440
441    /// Converts this BitmapBuilder into a mutable bitmap.
442    pub fn into_mut(mut self) -> MutableBitmap {
443        unsafe {
444            self.finish();
445            MutableBitmap::from_vec(self.bytes, self.bit_len)
446        }
447    }
448
449    /// The same as into_mut, but returns None if the bitmap is all-ones.
450    pub fn into_opt_mut_validity(mut self) -> Option<MutableBitmap> {
451        unsafe {
452            self.finish();
453            if self.set_bits_in_bytes == self.bit_len {
454                return None;
455            }
456            Some(MutableBitmap::from_vec(self.bytes, self.bit_len))
457        }
458    }
459
460    /// Freezes this BitmapBuilder into an immutable Bitmap.
461    pub fn freeze(mut self) -> Bitmap {
462        unsafe {
463            self.finish();
464            let storage = SharedStorage::from_vec(self.bytes);
465            Bitmap::from_inner_unchecked(
466                storage,
467                0,
468                self.bit_len,
469                Some(self.bit_len - self.set_bits_in_bytes),
470            )
471        }
472    }
473
474    /// The same as freeze, but returns None if the bitmap is all-ones.
475    pub fn into_opt_validity(mut self) -> Option<Bitmap> {
476        unsafe {
477            self.finish();
478            if self.set_bits_in_bytes == self.bit_len {
479                return None;
480            }
481            let storage = SharedStorage::from_vec(self.bytes);
482            let bitmap = Bitmap::from_inner_unchecked(
483                storage,
484                0,
485                self.bit_len,
486                Some(self.bit_len - self.set_bits_in_bytes),
487            );
488            Some(bitmap)
489        }
490    }
491
492    pub fn extend_trusted_len_iter<I>(&mut self, iterator: I)
493    where
494        I: Iterator<Item = bool> + TrustedLen,
495    {
496        self.reserve(iterator.size_hint().1.unwrap());
497        for b in iterator {
498            // SAFETY: we reserved and the iterator's length is trusted.
499            unsafe {
500                self.push_unchecked(b);
501            }
502        }
503    }
504
505    #[inline]
506    pub fn from_trusted_len_iter<I>(iterator: I) -> Self
507    where
508        I: Iterator<Item = bool> + TrustedLen,
509    {
510        let mut builder = Self::new();
511        builder.extend_trusted_len_iter(iterator);
512        builder
513    }
514}
515
516/// A wrapper for BitmapBuilder that does not allocate until the first false is
517/// pushed. Less efficient if you know there are false values because it must
518/// check if it has allocated for each push.
519pub enum OptBitmapBuilder {
520    AllTrue { bit_len: usize, bit_cap: usize },
521    MayHaveFalse(BitmapBuilder),
522}
523
524impl Default for OptBitmapBuilder {
525    fn default() -> Self {
526        Self::AllTrue {
527            bit_len: 0,
528            bit_cap: 0,
529        }
530    }
531}
532
533impl OptBitmapBuilder {
534    pub fn reserve(&mut self, additional: usize) {
535        match self {
536            Self::AllTrue { bit_len, bit_cap } => {
537                *bit_cap = usize::max(*bit_cap, *bit_len + additional);
538            },
539            Self::MayHaveFalse(inner) => inner.reserve(additional),
540        }
541    }
542
543    pub fn extend_constant(&mut self, length: usize, value: bool) {
544        match self {
545            Self::AllTrue { bit_len, bit_cap } => {
546                if value {
547                    *bit_cap = usize::max(*bit_cap, *bit_len + length);
548                    *bit_len += length;
549                } else {
550                    self.get_builder().extend_constant(length, value);
551                }
552            },
553            Self::MayHaveFalse(inner) => inner.extend_constant(length, value),
554        }
555    }
556
557    pub fn into_opt_validity(self) -> Option<Bitmap> {
558        match self {
559            Self::AllTrue { .. } => None,
560            Self::MayHaveFalse(inner) => inner.into_opt_validity(),
561        }
562    }
563
564    pub fn subslice_extend_from_opt_validity(
565        &mut self,
566        bitmap: Option<&Bitmap>,
567        start: usize,
568        length: usize,
569    ) {
570        match bitmap {
571            Some(bm) => {
572                self.get_builder()
573                    .subslice_extend_from_bitmap(bm, start, length);
574            },
575            None => {
576                self.extend_constant(length, true);
577            },
578        }
579    }
580
581    pub fn subslice_extend_each_repeated_from_opt_validity(
582        &mut self,
583        bitmap: Option<&Bitmap>,
584        start: usize,
585        length: usize,
586        repeats: usize,
587    ) {
588        match bitmap {
589            Some(bm) => {
590                self.get_builder()
591                    .subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats);
592            },
593            None => {
594                self.extend_constant(length * repeats, true);
595            },
596        }
597    }
598
599    /// # Safety
600    /// The indices must be in-bounds.
601    pub unsafe fn gather_extend_from_opt_validity(
602        &mut self,
603        bitmap: Option<&Bitmap>,
604        idxs: &[IdxSize],
605    ) {
606        match bitmap {
607            Some(bm) => {
608                self.get_builder().gather_extend_from_bitmap(bm, idxs);
609            },
610            None => {
611                self.extend_constant(idxs.len(), true);
612            },
613        }
614    }
615
616    pub fn opt_gather_extend_from_opt_validity(
617        &mut self,
618        bitmap: Option<&Bitmap>,
619        idxs: &[IdxSize],
620        length: usize,
621    ) {
622        match bitmap {
623            Some(bm) => {
624                self.get_builder().opt_gather_extend_from_bitmap(bm, idxs);
625            },
626            None => {
627                if let Some(first_oob) = idxs.iter().position(|idx| *idx as usize >= length) {
628                    let builder = self.get_builder();
629                    builder.extend_constant(first_oob, true);
630                    for idx in idxs.iter().skip(first_oob) {
631                        builder.push((*idx as usize) < length);
632                    }
633                } else {
634                    self.extend_constant(idxs.len(), true);
635                }
636            },
637        }
638    }
639
640    fn get_builder(&mut self) -> &mut BitmapBuilder {
641        match self {
642            Self::AllTrue { bit_len, bit_cap } => {
643                let mut builder = BitmapBuilder::with_capacity(*bit_cap);
644                builder.extend_constant(*bit_len, true);
645                *self = Self::MayHaveFalse(builder);
646                let Self::MayHaveFalse(inner) = self else {
647                    unreachable!()
648                };
649                inner
650            },
651            Self::MayHaveFalse(inner) => inner,
652        }
653    }
654}