#![allow(unsafe_op_in_unsafe_fn)]
use std::ops::Deref;
use std::sync::LazyLock;
use std::sync::atomic::{AtomicU64, Ordering};

use either::Either;
use polars_error::{PolarsResult, polars_bail};

use super::utils::{self, BitChunk, BitChunks, BitmapIter, count_zeros, fmt, get_bit_unchecked};
use super::{IntoIter, MutableBitmap, chunk_iter_to_vec, num_intersections_with};
use crate::array::Splitable;
use crate::bitmap::aligned::AlignedBitmapSlice;
use crate::bitmap::iterator::{
    FastU32BitmapIter, FastU56BitmapIter, FastU64BitmapIter, TrueIdxIter,
};
use crate::legacy::utils::FromTrustedLenIterator;
use crate::storage::SharedStorage;
use crate::trusted_len::TrustedLen;

const UNKNOWN_BIT_COUNT: u64 = u64::MAX;

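/// An immutable container of bits, stored as a shared byte buffer plus a bit
/// offset and a length in bits. Cloning and slicing do not copy the underlying
/// buffer; the number of unset bits is computed lazily and cached.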
#[derive(Default)]
pub struct Bitmap {
    storage: SharedStorage<u8>,
    // Both offset and length are measured in bits.
    offset: usize,
    length: usize,

    // Cache for the number of unset bits: `UNKNOWN_BIT_COUNT` (top bit set)
    // means the count is not known yet; otherwise it is the exact count.
    unset_bit_count_cache: AtomicU64,
}

#[inline(always)]
fn has_cached_unset_bit_count(ubcc: u64) -> bool {
    ubcc >> 63 == 0
}

impl Clone for Bitmap {
    fn clone(&self) -> Self {
        Self {
            storage: self.storage.clone(),
            offset: self.offset,
            length: self.length,
            unset_bit_count_cache: AtomicU64::new(
                self.unset_bit_count_cache.load(Ordering::Relaxed),
            ),
        }
    }
}

impl std::fmt::Debug for Bitmap {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (bytes, offset, len) = self.as_slice();
        fmt(bytes, offset, len, f)
    }
}

pub(super) fn check(bytes: &[u8], offset: usize, length: usize) -> PolarsResult<()> {
    if offset + length > bytes.len().saturating_mul(8) {
        polars_bail!(InvalidOperation:
            "The offset + length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})",
            offset + length,
            bytes.len().saturating_mul(8)
        );
    }
    Ok(())
}

impl Bitmap {
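    /// Initializes an empty [`Bitmap`].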
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

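    /// Initializes a new [`Bitmap`] from a byte buffer and a length in bits.
    /// # Errors
    /// Errors iff `length > bytes.len() * 8`.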
    #[inline]
    pub fn try_new(bytes: Vec<u8>, length: usize) -> PolarsResult<Self> {
        check(&bytes, 0, length)?;
        Ok(Self {
            storage: SharedStorage::from_vec(bytes),
            length,
            offset: 0,
            unset_bit_count_cache: AtomicU64::new(if length == 0 { 0 } else { UNKNOWN_BIT_COUNT }),
        })
    }

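    /// Returns the length of the [`Bitmap`] in bits.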
    #[inline]
    pub fn len(&self) -> usize {
        self.length
    }

    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

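    /// Returns a [`BitmapIter`] over the bits of this [`Bitmap`].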
    pub fn iter(&self) -> BitmapIter {
        BitmapIter::new(&self.storage, self.offset, self.length)
    }

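    /// Returns [`BitChunks`], an iterator over the bits packed into values of
    /// type `T`; bits that do not fill a whole `T` are exposed as a remainder.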
    pub fn chunks<T: BitChunk>(&self) -> BitChunks<T> {
        BitChunks::new(&self.storage, self.offset, self.length)
    }

    pub fn fast_iter_u32(&self) -> FastU32BitmapIter<'_> {
        FastU32BitmapIter::new(&self.storage, self.offset, self.length)
    }

    pub fn fast_iter_u56(&self) -> FastU56BitmapIter<'_> {
        FastU56BitmapIter::new(&self.storage, self.offset, self.length)
    }

    pub fn fast_iter_u64(&self) -> FastU64BitmapIter<'_> {
        FastU64BitmapIter::new(&self.storage, self.offset, self.length)
    }

    pub fn true_idx_iter(&self) -> TrueIdxIter<'_> {
        TrueIdxIter::new(self.len(), Some(self))
    }

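    /// Returns a view of this [`Bitmap`] as an [`AlignedBitmapSlice`] with chunk
    /// type `T`.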
    pub fn aligned<T: BitChunk>(&self) -> AlignedBitmapSlice<'_, T> {
        AlignedBitmapSlice::new(&self.storage, self.offset, self.length)
    }

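    /// Returns the underlying bytes together with the bit offset into the first
    /// byte and the length in bits.
    ///
    /// The returned byte slice is trimmed to the bytes that actually contain the
    /// `offset..offset + length` bit range.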
    #[inline]
    pub fn as_slice(&self) -> (&[u8], usize, usize) {
        let start = self.offset / 8;
        let len = (self.offset % 8 + self.length).saturating_add(7) / 8;
        (
            &self.storage[start..start + len],
            self.offset % 8,
            self.length,
        )
    }

    #[inline]
    pub fn set_bits(&self) -> usize {
        self.length - self.unset_bits()
    }

    #[inline]
    pub fn lazy_set_bits(&self) -> Option<usize> {
        Some(self.length - self.lazy_unset_bits()?)
    }

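    /// Returns the number of unset (zero) bits in this [`Bitmap`].
    ///
    /// Guarded by a cache: the first call is `O(length)`, subsequent calls are `O(1)`.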
    pub fn unset_bits(&self) -> usize {
        self.lazy_unset_bits().unwrap_or_else(|| {
            let zeros = count_zeros(&self.storage, self.offset, self.length);
            self.unset_bit_count_cache
                .store(zeros as u64, Ordering::Relaxed);
            zeros
        })
    }

    pub fn lazy_unset_bits(&self) -> Option<usize> {
        let cache = self.unset_bit_count_cache.load(Ordering::Relaxed);
        has_cached_unset_bit_count(cache).then_some(cache as usize)
    }

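    /// Updates the cached bit count.
    ///
    /// # Safety
    /// `bits_set` must be the exact number of set bits in this [`Bitmap`],
    /// otherwise later counts will be wrong.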
    pub unsafe fn update_bit_count(&mut self, bits_set: usize) {
        assert!(bits_set <= self.length);
        let zeros = self.length - bits_set;
        self.unset_bit_count_cache
            .store(zeros as u64, Ordering::Relaxed);
    }

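    /// Restricts `self` to the bit range `offset..offset + length` (relative to
    /// the current view).
    /// # Panics
    /// Panics iff `offset + length > self.length`.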
    #[inline]
    pub fn slice(&mut self, offset: usize, length: usize) {
        assert!(offset + length <= self.length);
        unsafe { self.slice_unchecked(offset, length) }
    }

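    /// Restricts `self` to the bit range `offset..offset + length` without
    /// bounds checking.
    /// # Safety
    /// The caller must guarantee that `offset + length <= self.length`.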
    #[inline]
    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
        // Fast path: the slice covers the entire bitmap.
        if offset == 0 && length == self.length {
            return;
        }

        let unset_bit_count_cache = self.unset_bit_count_cache.get_mut();
        // Fast path: all bits are unset or all bits are set, so the new unset
        // count follows directly from the new length.
        if *unset_bit_count_cache == 0 || *unset_bit_count_cache == self.length as u64 {
            let new_count = if *unset_bit_count_cache > 0 {
                length as u64
            } else {
                0
            };
            *unset_bit_count_cache = new_count;
            self.offset += offset;
            self.length = length;
            return;
        }

        if has_cached_unset_bit_count(*unset_bit_count_cache) {
            // If only a small portion is sliced off, count the zeros in the
            // removed head and tail and subtract them from the cached count;
            // otherwise invalidate the cache and let it be recomputed lazily.
            let small_portion = (self.length / 5).max(32);
            if length + small_portion >= self.length {
                let slice_end = self.offset + offset + length;
                let head_count = count_zeros(&self.storage, self.offset, offset);
                let tail_count =
                    count_zeros(&self.storage, slice_end, self.length - length - offset);
                let new_count = *unset_bit_count_cache - head_count as u64 - tail_count as u64;
                *unset_bit_count_cache = new_count;
            } else {
                *unset_bit_count_cache = UNKNOWN_BIT_COUNT;
            }
        }

        self.offset += offset;
        self.length = length;
    }

    #[inline]
    #[must_use]
    pub fn sliced(self, offset: usize, length: usize) -> Self {
        assert!(offset + length <= self.length);
        unsafe { self.sliced_unchecked(offset, length) }
    }

    #[inline]
    #[must_use]
    pub unsafe fn sliced_unchecked(mut self, offset: usize, length: usize) -> Self {
        self.slice_unchecked(offset, length);
        self
    }

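    /// Returns whether the bit at position `i` is set.
    /// # Panics
    /// Panics iff `i >= self.len()`.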
    #[inline]
    pub fn get_bit(&self, i: usize) -> bool {
        assert!(i < self.len());
        unsafe { self.get_bit_unchecked(i) }
    }

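    /// Unsafely returns whether the bit at position `i` is set.
    ///
    /// # Safety
    /// The caller must ensure `i < self.len()`.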
    #[inline]
    pub unsafe fn get_bit_unchecked(&self, i: usize) -> bool {
        debug_assert!(i < self.len());
        get_bit_unchecked(&self.storage, self.offset + i)
    }

    pub(crate) fn as_ptr(&self) -> *const u8 {
        self.storage.deref().as_ptr()
    }

    pub(crate) fn offset(&self) -> usize {
        self.offset
    }

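    /// Tries to convert this [`Bitmap`] into a [`MutableBitmap`], returning
    /// `Either::Right` if the underlying buffer could be reclaimed as a `Vec`,
    /// and `Either::Left(self)` otherwise.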
    pub fn into_mut(mut self) -> Either<Self, MutableBitmap> {
        match self.storage.try_into_vec() {
            Ok(v) => Either::Right(MutableBitmap::from_vec(v, self.length)),
            Err(storage) => {
                self.storage = storage;
                Either::Left(self)
            },
        }
    }

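    /// Converts this [`Bitmap`] into a [`MutableBitmap`], cloning the underlying
    /// buffer if it cannot be reclaimed.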
    pub fn make_mut(self) -> MutableBitmap {
        match self.into_mut() {
            Either::Left(data) => {
                if data.offset > 0 {
                    // Re-align the bits to a fresh buffer starting at bit offset 0.
                    let chunks = data.chunks::<u64>();
                    let remainder = chunks.remainder();
                    let vec = chunk_iter_to_vec(chunks.chain(std::iter::once(remainder)));
                    MutableBitmap::from_vec(vec, data.length)
                } else {
                    MutableBitmap::from_vec(data.storage.as_ref().to_vec(), data.length)
                }
            },
            Either::Right(data) => data,
        }
    }

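    /// Initializes an all-zero [`Bitmap`] of `length` bits. Small bitmaps share a
    /// preallocated global buffer of zero bytes, so they do not allocate.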
    #[inline]
    pub fn new_zeroed(length: usize) -> Self {
        // Small zeroed bitmaps all share one leaked, preallocated buffer of zero
        // bytes so that they do not allocate.
        const GLOBAL_ZERO_SIZE: usize = 1024 * 1024;
        static GLOBAL_ZEROES: LazyLock<SharedStorage<u8>> = LazyLock::new(|| {
            let mut ss = SharedStorage::from_vec(vec![0; GLOBAL_ZERO_SIZE]);
            ss.leak();
            ss
        });

        let bytes_needed = length.div_ceil(8);
        let storage = if bytes_needed <= GLOBAL_ZERO_SIZE {
            GLOBAL_ZEROES.clone()
        } else {
            SharedStorage::from_vec(vec![0; bytes_needed])
        };
        Self {
            storage,
            offset: 0,
            length,
            unset_bit_count_cache: AtomicU64::new(length as u64),
        }
    }

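    /// Initializes a [`Bitmap`] of `length` bits that are all set to `value`.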
    #[inline]
    pub fn new_with_value(value: bool, length: usize) -> Self {
        if !value {
            return Self::new_zeroed(length);
        }

        unsafe {
            Bitmap::from_inner_unchecked(
                SharedStorage::from_vec(vec![u8::MAX; length.saturating_add(7) / 8]),
                0,
                length,
                Some(0),
            )
        }
    }

    #[inline]
    pub fn null_count_range(&self, offset: usize, length: usize) -> usize {
        count_zeros(&self.storage, self.offset + offset, length)
    }

    #[inline]
    pub fn from_u8_slice<T: AsRef<[u8]>>(slice: T, length: usize) -> Self {
        Bitmap::try_new(slice.as_ref().to_vec(), length).unwrap()
    }

    #[inline]
    pub fn from_u8_vec(vec: Vec<u8>, length: usize) -> Self {
        Bitmap::try_new(vec, length).unwrap()
    }

    #[inline]
    pub fn get(&self, i: usize) -> Option<bool> {
        if i < self.len() {
            Some(unsafe { self.get_bit_unchecked(i) })
        } else {
            None
        }
    }

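    /// Creates a new [`Bitmap`] from its internal representation without validation.
    ///
    /// # Safety
    /// The caller must ensure that `offset + length` does not exceed
    /// `storage.len() * 8`, and that `unset_bits`, if given, is the exact number
    /// of unset bits in the selected range.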
    pub unsafe fn from_inner_unchecked(
        storage: SharedStorage<u8>,
        offset: usize,
        length: usize,
        unset_bits: Option<usize>,
    ) -> Self {
        debug_assert!(check(&storage[..], offset, length).is_ok());

        let unset_bit_count_cache = if let Some(n) = unset_bits {
            AtomicU64::new(n as u64)
        } else {
            AtomicU64::new(UNKNOWN_BIT_COUNT)
        };
        Self {
            storage,
            offset,
            length,
            unset_bit_count_cache,
        }
    }

    pub fn intersects_with(&self, other: &Self) -> bool {
        self.num_intersections_with(other) != 0
    }

    pub fn num_intersections_with(&self, other: &Self) -> usize {
        num_intersections_with(self, other)
    }

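    /// Returns a [`Bitmap`] where each bit is taken from `truthy` if the
    /// corresponding bit in `self` is set, and from `falsy` otherwise.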
    pub fn select(&self, truthy: &Self, falsy: &Self) -> Self {
        super::bitmap_ops::select(self, truthy, falsy)
    }

    pub fn select_constant(&self, truthy: &Self, falsy: bool) -> Self {
        super::bitmap_ops::select_constant(self, truthy, falsy)
    }

    pub fn num_edges(&self) -> usize {
        super::bitmap_ops::num_edges(self)
    }

    pub fn leading_zeros(&self) -> usize {
        utils::leading_zeros(&self.storage, self.offset, self.length)
    }
    pub fn leading_ones(&self) -> usize {
        utils::leading_ones(&self.storage, self.offset, self.length)
    }
    pub fn trailing_zeros(&self) -> usize {
        utils::trailing_zeros(&self.storage, self.offset, self.length)
    }
    pub fn trailing_ones(&self) -> usize {
        utils::trailing_ones(&self.storage, self.offset, self.length)
    }

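    /// Strips the leading zero bits off `self` in place and returns how many bits
    /// were removed.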
    pub fn take_leading_zeros(&mut self) -> usize {
        if self
            .lazy_unset_bits()
            .is_some_and(|unset_bits| unset_bits == self.length)
        {
            let leading_zeros = self.length;
            self.offset += self.length;
            self.length = 0;
            *self.unset_bit_count_cache.get_mut() = 0;
            return leading_zeros;
        }

        let leading_zeros = self.leading_zeros();
        self.offset += leading_zeros;
        self.length -= leading_zeros;
        if has_cached_unset_bit_count(*self.unset_bit_count_cache.get_mut()) {
            *self.unset_bit_count_cache.get_mut() -= leading_zeros as u64;
        }
        leading_zeros
    }
    pub fn take_leading_ones(&mut self) -> usize {
        if self
            .lazy_unset_bits()
            .is_some_and(|unset_bits| unset_bits == 0)
        {
            let leading_ones = self.length;
            self.offset += self.length;
            self.length = 0;
            *self.unset_bit_count_cache.get_mut() = 0;
            return leading_ones;
        }

        let leading_ones = self.leading_ones();
        self.offset += leading_ones;
        self.length -= leading_ones;
        leading_ones
    }
    pub fn take_trailing_zeros(&mut self) -> usize {
        if self
            .lazy_unset_bits()
            .is_some_and(|unset_bits| unset_bits == self.length)
        {
            let trailing_zeros = self.length;
            self.length = 0;
            *self.unset_bit_count_cache.get_mut() = 0;
            return trailing_zeros;
        }

        let trailing_zeros = self.trailing_zeros();
        self.length -= trailing_zeros;
        if has_cached_unset_bit_count(*self.unset_bit_count_cache.get_mut()) {
            *self.unset_bit_count_cache.get_mut() -= trailing_zeros as u64;
        }
        trailing_zeros
    }
    pub fn take_trailing_ones(&mut self) -> usize {
        if self
            .lazy_unset_bits()
            .is_some_and(|unset_bits| unset_bits == 0)
        {
            let trailing_ones = self.length;
            self.length = 0;
            *self.unset_bit_count_cache.get_mut() = 0;
            return trailing_ones;
        }

        let trailing_ones = self.trailing_ones();
        self.length -= trailing_ones;
        trailing_ones
    }
629}
630
631impl<P: AsRef<[bool]>> From<P> for Bitmap {
632 fn from(slice: P) -> Self {
633 Self::from_trusted_len_iter(slice.as_ref().iter().copied())
634 }
635}
636
637impl FromIterator<bool> for Bitmap {
638 fn from_iter<I>(iter: I) -> Self
639 where
640 I: IntoIterator<Item = bool>,
641 {
642 MutableBitmap::from_iter(iter).into()
643 }
644}
645
646impl FromTrustedLenIterator<bool> for Bitmap {
647 fn from_iter_trusted_length<T: IntoIterator<Item = bool>>(iter: T) -> Self
648 where
649 T::IntoIter: TrustedLen,
650 {
651 MutableBitmap::from_trusted_len_iter(iter.into_iter()).into()
652 }
653}
654
655impl Bitmap {
656 #[inline]
661 pub unsafe fn from_trusted_len_iter_unchecked<I: Iterator<Item = bool>>(iterator: I) -> Self {
662 MutableBitmap::from_trusted_len_iter_unchecked(iterator).into()
663 }
664
665 #[inline]
667 pub fn from_trusted_len_iter<I: TrustedLen<Item = bool>>(iterator: I) -> Self {
668 MutableBitmap::from_trusted_len_iter(iterator).into()
669 }
670
671 #[inline]
673 pub fn try_from_trusted_len_iter<E, I: TrustedLen<Item = std::result::Result<bool, E>>>(
674 iterator: I,
675 ) -> std::result::Result<Self, E> {
676 Ok(MutableBitmap::try_from_trusted_len_iter(iterator)?.into())
677 }
678
679 #[inline]
684 pub unsafe fn try_from_trusted_len_iter_unchecked<
685 E,
686 I: Iterator<Item = std::result::Result<bool, E>>,
687 >(
688 iterator: I,
689 ) -> std::result::Result<Self, E> {
690 Ok(MutableBitmap::try_from_trusted_len_iter_unchecked(iterator)?.into())
691 }
692}
693
694impl<'a> IntoIterator for &'a Bitmap {
695 type Item = bool;
696 type IntoIter = BitmapIter<'a>;
697
698 fn into_iter(self) -> Self::IntoIter {
699 BitmapIter::<'a>::new(&self.storage, self.offset, self.length)
700 }
701}
702
703impl IntoIterator for Bitmap {
704 type Item = bool;
705 type IntoIter = IntoIter;
706
707 fn into_iter(self) -> Self::IntoIter {
708 IntoIter::new(self)
709 }
710}
711
impl Splitable for Bitmap {
    #[inline(always)]
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        if offset == 0 {
            return (Self::new(), self.clone());
        }
        if offset == self.len() {
            return (self.clone(), Self::new());
        }

        let ubcc = self.unset_bit_count_cache.load(Ordering::Relaxed);

        let lhs_length = offset;
        let rhs_length = self.length - offset;

        let mut lhs_ubcc = UNKNOWN_BIT_COUNT;
        let mut rhs_ubcc = UNKNOWN_BIT_COUNT;

        if has_cached_unset_bit_count(ubcc) {
            if ubcc == 0 {
                lhs_ubcc = 0;
                rhs_ubcc = 0;
            } else if ubcc == self.length as u64 {
                lhs_ubcc = offset as u64;
                rhs_ubcc = (self.length - offset) as u64;
            } else {
                let small_portion = (self.length / 4).max(32);

                // Only count zeros eagerly when the smaller half is a small
                // portion of the bitmap; the count of the other half then
                // follows by subtraction from the cached total. Otherwise both
                // counts stay unknown and are recomputed lazily.
                if lhs_length <= rhs_length {
                    if rhs_length + small_portion >= self.length {
                        let count = count_zeros(&self.storage, self.offset, lhs_length) as u64;
                        lhs_ubcc = count;
                        rhs_ubcc = ubcc - count;
                    }
                } else if lhs_length + small_portion >= self.length {
                    let count = count_zeros(&self.storage, self.offset + offset, rhs_length) as u64;
                    lhs_ubcc = ubcc - count;
                    rhs_ubcc = count;
                }
            }
        }

        debug_assert!(lhs_ubcc == UNKNOWN_BIT_COUNT || lhs_ubcc <= ubcc);
        debug_assert!(rhs_ubcc == UNKNOWN_BIT_COUNT || rhs_ubcc <= ubcc);

        (
            Self {
                storage: self.storage.clone(),
                offset: self.offset,
                length: lhs_length,
                unset_bit_count_cache: AtomicU64::new(lhs_ubcc),
            },
            Self {
                storage: self.storage.clone(),
                offset: self.offset + offset,
                length: rhs_length,
                unset_bit_count_cache: AtomicU64::new(rhs_ubcc),
            },
        )
    }
}