#![allow(unsafe_op_in_unsafe_fn)]
use std::ops::Deref;
use either::Either;
use polars_buffer::{Buffer, SharedStorage};
use polars_error::{PolarsResult, polars_bail};
use polars_utils::relaxed_cell::RelaxedCell;
use super::utils::{self, BitChunk, BitChunks, BitmapIter, count_zeros, fmt, get_bit_unchecked};
use super::{IntoIter, MutableBitmap, chunk_iter_to_vec, num_intersections_with};
use crate::array::Splitable;
use crate::bitmap::aligned::AlignedBitmapSlice;
use crate::bitmap::iterator::{
FastU32BitmapIter, FastU56BitmapIter, FastU64BitmapIter, TrueIdxIter,
};
use crate::bitmap::utils::bytes_for;
use crate::legacy::utils::FromTrustedLenIterator;
use crate::trusted_len::TrustedLen;
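/// Sentinel stored in `unset_bit_count_cache` when the number of unset bits
/// has not been computed yet. Valid counts never have the high bit set, so
/// this value (all ones) is unambiguous.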
const UNKNOWN_BIT_COUNT: u64 = u64::MAX;
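/// An immutable container semantically equivalent to `Arc<Vec<bool>>`, but
/// represented as shared bytes where each boolean is a single bit in LSB
/// order (bit `i` lives in byte `i / 8` at bit position `i % 8`).
///
/// Cloning and slicing are `O(1)`: both share the underlying storage and only
/// adjust `offset` and `length`. The number of unset bits is cached after the
/// first count.
///
/// Illustrative usage (a sketch; the `polars_arrow` crate path is assumed):
/// ```ignore
/// use polars_arrow::bitmap::Bitmap;
///
/// // Bits are stored in LSB order: 0b00000101 sets bits 0 and 2.
/// let bitmap = Bitmap::try_new(vec![0b00000101], 3).unwrap();
/// assert_eq!(bitmap.iter().collect::<Vec<_>>(), vec![true, false, true]);
/// assert_eq!(bitmap.set_bits(), 2);
///
/// // Slicing is O(1) and shares the underlying storage.
/// let sliced = bitmap.clone().sliced(1, 2);
/// assert_eq!(sliced.iter().collect::<Vec<_>>(), vec![false, true]);
/// ```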
#[derive(Default, Clone)]
pub struct Bitmap {
storage: SharedStorage<u8>,
offset: usize,
length: usize,
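    /// Cached number of unset bits in `[offset, offset + length)`, or
    /// `UNKNOWN_BIT_COUNT` if it has not been computed yet.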
unset_bit_count_cache: RelaxedCell<u64>,
}
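/// Returns whether `ubcc` holds an actual count rather than the
/// `UNKNOWN_BIT_COUNT` sentinel, which is the only value with the high bit set.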
#[inline(always)]
fn has_cached_unset_bit_count(ubcc: u64) -> bool {
ubcc >> 63 == 0
}
impl std::fmt::Debug for Bitmap {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let (bytes, offset, len) = self.as_slice();
fmt(bytes, offset, len, f)
}
}
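/// Checks that the range `[offset, offset + length)` fits within the bit
/// capacity of `bytes` (i.e. `bytes.len() * 8`).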
pub(super) fn check(bytes: &[u8], offset: usize, length: usize) -> PolarsResult<()> {
if offset + length > bytes.len().saturating_mul(8) {
polars_bail!(InvalidOperation:
"The offset + length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})",
offset + length,
bytes.len().saturating_mul(8)
);
}
Ok(())
}
impl Bitmap {
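    /// Creates an empty [`Bitmap`].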
#[inline]
pub fn new() -> Self {
Self::default()
}
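    /// Creates a [`Bitmap`] from `bytes` and a bit `length`.
    ///
    /// # Errors
    /// Errors iff `length > bytes.len() * 8`.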
#[inline]
pub fn try_new(bytes: Vec<u8>, length: usize) -> PolarsResult<Self> {
check(&bytes, 0, length)?;
Ok(Self {
storage: SharedStorage::from_vec(bytes),
length,
offset: 0,
unset_bit_count_cache: RelaxedCell::from(if length == 0 {
0
} else {
UNKNOWN_BIT_COUNT
}),
})
}
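    /// Returns the length of the [`Bitmap`] in bits.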
#[inline]
pub fn len(&self) -> usize {
self.length
}
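    /// Returns whether the [`Bitmap`] is empty.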
#[inline]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
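    /// Returns an iterator over the bits of this [`Bitmap`].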
pub fn iter(&self) -> BitmapIter<'_> {
BitmapIter::new(&self.storage, self.offset, self.length)
}
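    /// Returns an iterator over the bits of this [`Bitmap`] in chunks of `T`,
    /// plus a remainder for the bits that do not fill a whole chunk.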
pub fn chunks<T: BitChunk>(&self) -> BitChunks<'_, T> {
BitChunks::new(&self.storage, self.offset, self.length)
}
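    /// Returns a fast iterator over the bits, 32 bits at a time.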
pub fn fast_iter_u32(&self) -> FastU32BitmapIter<'_> {
FastU32BitmapIter::new(&self.storage, self.offset, self.length)
}
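    /// Returns a fast iterator over the bits, 56 bits at a time (packed in
    /// `u64`s).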
pub fn fast_iter_u56(&self) -> FastU56BitmapIter<'_> {
FastU56BitmapIter::new(&self.storage, self.offset, self.length)
}
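    /// Returns a fast iterator over the bits, 64 bits at a time.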
pub fn fast_iter_u64(&self) -> FastU64BitmapIter<'_> {
FastU64BitmapIter::new(&self.storage, self.offset, self.length)
}
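    /// Returns an iterator over the indices of the set bits.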
pub fn true_idx_iter(&self) -> TrueIdxIter<'_> {
TrueIdxIter::new(self.len(), Some(self))
}
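    /// Returns a view of the bits aligned to chunk boundaries of `T`.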
pub fn aligned<T: BitChunk>(&self) -> AlignedBitmapSlice<'_, T> {
AlignedBitmapSlice::new(&self.storage, self.offset, self.length)
}
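    /// Returns the byte slice of this [`Bitmap`], together with the bit
    /// offset into its first byte and the length in bits.
    ///
    /// The slice is truncated to the bytes actually covered by the bitmap.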
#[inline]
pub fn as_slice(&self) -> (&[u8], usize, usize) {
let start = self.offset / 8;
        let len = (self.offset % 8 + self.length).div_ceil(8);
(
&self.storage[start..start + len],
self.offset % 8,
self.length,
)
}
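    /// Returns the number of set bits on this [`Bitmap`].
    ///
    /// This is `O(self.len())` on the first call; the result is cached.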
#[inline]
pub fn set_bits(&self) -> usize {
self.length - self.unset_bits()
}
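    /// Returns the number of set bits iff it is already known, without
    /// counting.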
#[inline]
pub fn lazy_set_bits(&self) -> Option<usize> {
Some(self.length - self.lazy_unset_bits()?)
}
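    /// Returns the number of unset bits on this [`Bitmap`].
    ///
    /// Counts (`O(self.len())`) on the first call and caches the result.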
pub fn unset_bits(&self) -> usize {
self.lazy_unset_bits().unwrap_or_else(|| {
let zeros = count_zeros(&self.storage, self.offset, self.length);
self.unset_bit_count_cache.store(zeros as u64);
zeros
})
}
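    /// Returns the number of unset bits iff it is already known, without
    /// counting.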
pub fn lazy_unset_bits(&self) -> Option<usize> {
let cache = self.unset_bit_count_cache.load();
has_cached_unset_bit_count(cache).then_some(cache as usize)
}
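    /// Updates the cached number of set bits without recounting.
    ///
    /// # Safety
    /// The caller must ensure `bits_set` is exactly the number of set bits in
    /// this bitmap; an incorrect value corrupts the cache.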
pub unsafe fn update_bit_count(&mut self, bits_set: usize) {
assert!(bits_set <= self.length);
let zeros = self.length - bits_set;
self.unset_bit_count_cache.store(zeros as u64);
}
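    /// Slices this [`Bitmap`] to the range `[offset, offset + length)`.
    ///
    /// # Panics
    /// Panics iff `offset + length` exceeds the bitmap's length.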
#[inline]
pub fn slice(&mut self, offset: usize, length: usize) {
assert!(offset + length <= self.length);
unsafe { self.slice_unchecked(offset, length) }
}
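    /// Slices this [`Bitmap`] without bound checks.
    ///
    /// Keeps the unset-bit cache exact when that is cheap: if the slice
    /// retains most of the bitmap, only the trimmed edges are recounted;
    /// otherwise the cache is invalidated.
    ///
    /// # Safety
    /// The caller must ensure that `offset + length` does not exceed the
    /// bitmap's current length.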
#[inline]
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
if offset == 0 && length == self.length {
return;
}
let unset_bit_count_cache = self.unset_bit_count_cache.get_mut();
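        // Fast path: if every bit is set (cache == 0) or every bit is unset
        // (cache == length), the count of any sub-slice is known exactly.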
if *unset_bit_count_cache == 0 || *unset_bit_count_cache == self.length as u64 {
let new_count = if *unset_bit_count_cache > 0 {
length as u64
} else {
0
};
*unset_bit_count_cache = new_count;
self.offset += offset;
self.length = length;
return;
}
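        // If the slice keeps most of the bitmap, counting the few trimmed
        // bits at the head and tail is cheaper than recounting the whole
        // slice later; otherwise drop the cache.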
if has_cached_unset_bit_count(*unset_bit_count_cache) {
let small_portion = (self.length / 5).max(32);
if length + small_portion >= self.length {
let slice_end = self.offset + offset + length;
let head_count = count_zeros(&self.storage, self.offset, offset);
let tail_count =
count_zeros(&self.storage, slice_end, self.length - length - offset);
let new_count = *unset_bit_count_cache - head_count as u64 - tail_count as u64;
*unset_bit_count_cache = new_count;
} else {
*unset_bit_count_cache = UNKNOWN_BIT_COUNT;
}
}
self.offset += offset;
self.length = length;
}
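    /// Returns this [`Bitmap`] sliced to `[offset, offset + length)`.
    ///
    /// # Panics
    /// Panics iff `offset + length` exceeds the bitmap's length.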
#[inline]
#[must_use]
pub fn sliced(self, offset: usize, length: usize) -> Self {
assert!(offset + length <= self.length);
unsafe { self.sliced_unchecked(offset, length) }
}
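    /// Returns a sliced [`Bitmap`] without bound checks.
    ///
    /// # Safety
    /// The caller must ensure that `offset + length` does not exceed the
    /// bitmap's current length.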
#[inline]
#[must_use]
pub unsafe fn sliced_unchecked(mut self, offset: usize, length: usize) -> Self {
self.slice_unchecked(offset, length);
self
}
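    /// Returns the bit at position `i`.
    ///
    /// # Panics
    /// Panics iff `i >= self.len()`.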
#[inline]
pub fn get_bit(&self, i: usize) -> bool {
assert!(i < self.len());
unsafe { self.get_bit_unchecked(i) }
}
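    /// Returns the bit at position `i` without bound checks.
    ///
    /// # Safety
    /// The caller must ensure that `i < self.len()`.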
#[inline]
pub unsafe fn get_bit_unchecked(&self, i: usize) -> bool {
debug_assert!(i < self.len());
get_bit_unchecked(&self.storage, self.offset + i)
}
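    /// Returns a pointer to the start of the underlying bytes (not adjusted
    /// for `offset`).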
pub(crate) fn as_ptr(&self) -> *const u8 {
self.storage.deref().as_ptr()
}
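    /// Returns the bit offset of this [`Bitmap`] into its underlying bytes.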
pub(crate) fn offset(&self) -> usize {
self.offset
}
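    /// Tries to convert this [`Bitmap`] into a [`MutableBitmap`].
    ///
    /// Returns `Either::Right` with a [`MutableBitmap`] iff this bitmap has a
    /// zero offset and its storage is uniquely owned; otherwise returns
    /// `Either::Left` with `self` unchanged.
    ///
    /// Illustrative (a sketch; the `polars_arrow` crate path is assumed):
    /// ```ignore
    /// use polars_arrow::bitmap::Bitmap;
    ///
    /// // Uniquely owned storage converts in place.
    /// let bitmap = Bitmap::from_u8_vec(vec![0b0000_1111], 8);
    /// assert!(bitmap.into_mut().is_right());
    ///
    /// // Shared storage is returned unchanged on the left.
    /// let bitmap = Bitmap::from_u8_vec(vec![0b0000_1111], 8);
    /// let _shared = bitmap.clone();
    /// assert!(bitmap.into_mut().is_left());
    /// ```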
    pub fn into_mut(mut self) -> Either<Self, MutableBitmap> {
        // A non-zero offset cannot be represented by a MutableBitmap (which
        // always starts at bit 0), so keep the immutable representation.
        if self.offset != 0 {
            return Either::Left(self);
        }
        match self.storage.try_into_vec() {
            Ok(v) => Either::Right(MutableBitmap::from_vec(v, self.length)),
            Err(storage) => {
                self.storage = storage;
                Either::Left(self)
            },
        }
    }
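    /// Converts this [`Bitmap`] into a [`MutableBitmap`], cloning (and
    /// re-aligning to a zero offset) its internal representation if it is
    /// shared or offsetted.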
pub fn make_mut(self) -> MutableBitmap {
match self.into_mut() {
Either::Left(data) => {
if data.offset > 0 {
let chunks = data.chunks::<u64>();
let remainder = chunks.remainder();
let vec = chunk_iter_to_vec(chunks.chain(std::iter::once(remainder)));
MutableBitmap::from_vec(vec, data.length)
} else {
let len = bytes_for(data.length);
MutableBitmap::from_vec(data.storage[0..len].to_vec(), data.length)
}
},
Either::Right(data) => data,
}
}
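    /// Creates a new [`Bitmap`] of `length` bits, all unset.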
#[inline]
pub fn new_zeroed(length: usize) -> Self {
let bytes_needed = length.div_ceil(8);
let storage = Buffer::zeroed(bytes_needed).into_storage();
Self {
storage,
offset: 0,
length,
unset_bit_count_cache: RelaxedCell::from(length as u64),
}
}
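    /// Creates a new [`Bitmap`] of `length` bits, all set to `value`.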
#[inline]
pub fn new_with_value(value: bool, length: usize) -> Self {
if !value {
return Self::new_zeroed(length);
}
unsafe {
Bitmap::from_inner_unchecked(
                SharedStorage::from_vec(vec![u8::MAX; length.div_ceil(8)]),
0,
length,
Some(0),
)
}
}
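    /// Counts the unset bits in the sub-range `[offset, offset + length)`,
    /// where `offset` is relative to this bitmap's own offset.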
#[inline]
pub fn null_count_range(&self, offset: usize, length: usize) -> usize {
count_zeros(&self.storage, self.offset + offset, length)
}
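    /// Creates a new [`Bitmap`] from a byte slice and a bit length, copying
    /// the bytes.
    ///
    /// # Panics
    /// Panics iff `length > slice.as_ref().len() * 8`.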
#[inline]
pub fn from_u8_slice<T: AsRef<[u8]>>(slice: T, length: usize) -> Self {
Bitmap::try_new(slice.as_ref().to_vec(), length).unwrap()
}
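    /// Creates a new [`Bitmap`] from a `Vec<u8>` and a bit length, without
    /// copying.
    ///
    /// # Panics
    /// Panics iff `length > vec.len() * 8`.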
#[inline]
pub fn from_u8_vec(vec: Vec<u8>, length: usize) -> Self {
Bitmap::try_new(vec, length).unwrap()
}
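    /// Returns the bit at position `i`, or `None` if `i` is out of bounds.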
#[inline]
pub fn get(&self, i: usize) -> Option<bool> {
if i < self.len() {
Some(unsafe { self.get_bit_unchecked(i) })
} else {
None
}
}
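    /// Creates a [`Bitmap`] directly from its internal representation.
    ///
    /// # Safety
    /// The caller must ensure that `offset + length <= storage.len() * 8`,
    /// and that `unset_bits`, if `Some`, is the exact number of unset bits in
    /// the range.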
pub unsafe fn from_inner_unchecked(
storage: SharedStorage<u8>,
offset: usize,
length: usize,
unset_bits: Option<usize>,
) -> Self {
debug_assert!(check(&storage[..], offset, length).is_ok());
let unset_bit_count_cache = if let Some(n) = unset_bits {
RelaxedCell::from(n as u64)
} else {
RelaxedCell::from(UNKNOWN_BIT_COUNT)
};
Self {
storage,
offset,
length,
unset_bit_count_cache,
}
}
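    /// Returns whether the two [`Bitmap`]s have a set bit in a common
    /// position.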
pub fn intersects_with(&self, other: &Self) -> bool {
self.num_intersections_with(other) != 0
}
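    /// Returns the number of positions at which both [`Bitmap`]s have a set
    /// bit.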
pub fn num_intersections_with(&self, other: &Self) -> usize {
num_intersections_with(
super::bitmask::BitMask::from_bitmap(self),
super::bitmask::BitMask::from_bitmap(other),
)
}
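    /// Returns a [`Bitmap`] that takes its bit from `truthy` where `self` is
    /// set and from `falsy` where it is unset.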
pub fn select(&self, truthy: &Self, falsy: &Self) -> Self {
super::bitmap_ops::select(self, truthy, falsy)
}
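    /// Like [`Bitmap::select`], but with a constant boolean in place of the
    /// `falsy` bitmap.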
pub fn select_constant(&self, truthy: &Self, falsy: bool) -> Self {
super::bitmap_ops::select_constant(self, truthy, falsy)
}
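    /// Returns the number of edges, i.e. transitions between a set and an
    /// unset bit, in this [`Bitmap`].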
pub fn num_edges(&self) -> usize {
super::bitmap_ops::num_edges(self)
}
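    /// Returns the number of unset bits before the first set bit.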
pub fn leading_zeros(&self) -> usize {
utils::leading_zeros(&self.storage, self.offset, self.length)
}
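    /// Returns the number of set bits before the first unset bit.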
pub fn leading_ones(&self) -> usize {
utils::leading_ones(&self.storage, self.offset, self.length)
}
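    /// Returns the number of unset bits after the last set bit.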
pub fn trailing_zeros(&self) -> usize {
utils::trailing_zeros(&self.storage, self.offset, self.length)
}
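    /// Returns the number of set bits after the last unset bit.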
pub fn trailing_ones(&self) -> usize {
utils::trailing_ones(&self.storage, self.offset, self.length)
}
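    /// Slices off the leading unset bits and returns how many were removed.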
pub fn take_leading_zeros(&mut self) -> usize {
if self
.lazy_unset_bits()
.is_some_and(|unset_bits| unset_bits == self.length)
{
let leading_zeros = self.length;
self.offset += self.length;
self.length = 0;
*self.unset_bit_count_cache.get_mut() = 0;
return leading_zeros;
}
let leading_zeros = self.leading_zeros();
self.offset += leading_zeros;
self.length -= leading_zeros;
if has_cached_unset_bit_count(*self.unset_bit_count_cache.get_mut()) {
*self.unset_bit_count_cache.get_mut() -= leading_zeros as u64;
}
leading_zeros
}
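    /// Slices off the leading set bits and returns how many were removed.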
pub fn take_leading_ones(&mut self) -> usize {
if self
.lazy_unset_bits()
.is_some_and(|unset_bits| unset_bits == 0)
{
let leading_ones = self.length;
self.offset += self.length;
self.length = 0;
*self.unset_bit_count_cache.get_mut() = 0;
return leading_ones;
}
let leading_ones = self.leading_ones();
self.offset += leading_ones;
self.length -= leading_ones;
leading_ones
}
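    /// Slices off the trailing unset bits and returns how many were removed.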
pub fn take_trailing_zeros(&mut self) -> usize {
if self
.lazy_unset_bits()
.is_some_and(|unset_bits| unset_bits == self.length)
{
let trailing_zeros = self.length;
self.length = 0;
*self.unset_bit_count_cache.get_mut() = 0;
return trailing_zeros;
}
let trailing_zeros = self.trailing_zeros();
self.length -= trailing_zeros;
if has_cached_unset_bit_count(*self.unset_bit_count_cache.get_mut()) {
*self.unset_bit_count_cache.get_mut() -= trailing_zeros as u64;
}
trailing_zeros
}
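    /// Slices off the trailing set bits and returns how many were removed.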
pub fn take_trailing_ones(&mut self) -> usize {
if self
.lazy_unset_bits()
.is_some_and(|unset_bits| unset_bits == 0)
{
let trailing_ones = self.length;
self.length = 0;
*self.unset_bit_count_cache.get_mut() = 0;
return trailing_ones;
}
let trailing_ones = self.trailing_ones();
self.length -= trailing_ones;
trailing_ones
}
}
impl<P: AsRef<[bool]>> From<P> for Bitmap {
fn from(slice: P) -> Self {
Self::from_trusted_len_iter(slice.as_ref().iter().copied())
}
}
impl FromIterator<bool> for Bitmap {
fn from_iter<I>(iter: I) -> Self
where
I: IntoIterator<Item = bool>,
{
MutableBitmap::from_iter(iter).into()
}
}
impl FromTrustedLenIterator<bool> for Bitmap {
fn from_iter_trusted_length<T: IntoIterator<Item = bool>>(iter: T) -> Self
where
T::IntoIter: TrustedLen,
{
MutableBitmap::from_trusted_len_iter(iter.into_iter()).into()
}
}
impl Bitmap {
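    /// Creates a new [`Bitmap`] from an iterator of booleans.
    ///
    /// # Safety
    /// The iterator must report an accurate length (as if it were
    /// [`TrustedLen`]).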
#[inline]
pub unsafe fn from_trusted_len_iter_unchecked<I: Iterator<Item = bool>>(iterator: I) -> Self {
MutableBitmap::from_trusted_len_iter_unchecked(iterator).into()
}
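    /// Creates a new [`Bitmap`] from an iterator of booleans of known length.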
#[inline]
pub fn from_trusted_len_iter<I: TrustedLen<Item = bool>>(iterator: I) -> Self {
MutableBitmap::from_trusted_len_iter(iterator).into()
}
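    /// Creates a new [`Bitmap`] from a fallible iterator of booleans of known
    /// length, propagating the first error.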
#[inline]
pub fn try_from_trusted_len_iter<E, I: TrustedLen<Item = std::result::Result<bool, E>>>(
iterator: I,
) -> std::result::Result<Self, E> {
Ok(MutableBitmap::try_from_trusted_len_iter(iterator)?.into())
}
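    /// Creates a new [`Bitmap`] from a fallible iterator of booleans.
    ///
    /// # Safety
    /// The iterator must report an accurate length (as if it were
    /// [`TrustedLen`]).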
#[inline]
pub unsafe fn try_from_trusted_len_iter_unchecked<
E,
I: Iterator<Item = std::result::Result<bool, E>>,
>(
iterator: I,
) -> std::result::Result<Self, E> {
Ok(MutableBitmap::try_from_trusted_len_iter_unchecked(iterator)?.into())
}
}
impl<'a> IntoIterator for &'a Bitmap {
type Item = bool;
type IntoIter = BitmapIter<'a>;
fn into_iter(self) -> Self::IntoIter {
BitmapIter::<'a>::new(&self.storage, self.offset, self.length)
}
}
impl IntoIterator for Bitmap {
type Item = bool;
type IntoIter = IntoIter;
fn into_iter(self) -> Self::IntoIter {
IntoIter::new(self)
}
}
impl Splitable for Bitmap {
#[inline(always)]
fn check_bound(&self, offset: usize) -> bool {
offset <= self.len()
}
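    /// Splits this [`Bitmap`] at `offset`, sharing the storage and
    /// propagating the cached unset-bit count to both halves when cheap.
    ///
    /// # Safety
    /// The caller must ensure that `offset <= self.len()`.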
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
if offset == 0 {
return (Self::new(), self.clone());
}
if offset == self.len() {
return (self.clone(), Self::new());
}
let ubcc = self.unset_bit_count_cache.load();
let lhs_length = offset;
let rhs_length = self.length - offset;
let mut lhs_ubcc = UNKNOWN_BIT_COUNT;
let mut rhs_ubcc = UNKNOWN_BIT_COUNT;
if has_cached_unset_bit_count(ubcc) {
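            // All bits set (ubcc == 0) or all unset (ubcc == length): both
            // halves have exactly known counts.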
if ubcc == 0 {
lhs_ubcc = 0;
rhs_ubcc = 0;
} else if ubcc == self.length as u64 {
lhs_ubcc = offset as u64;
rhs_ubcc = (self.length - offset) as u64;
} else {
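                // Scan only the smaller half and derive the other from the
                // cached total, and only if the scanned half is a small
                // fraction of the bitmap.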
let small_portion = (self.length / 4).max(32);
if lhs_length <= rhs_length {
if rhs_length + small_portion >= self.length {
let count = count_zeros(&self.storage, self.offset, lhs_length) as u64;
lhs_ubcc = count;
rhs_ubcc = ubcc - count;
}
} else if lhs_length + small_portion >= self.length {
let count = count_zeros(&self.storage, self.offset + offset, rhs_length) as u64;
lhs_ubcc = ubcc - count;
rhs_ubcc = count;
}
}
}
debug_assert!(lhs_ubcc == UNKNOWN_BIT_COUNT || lhs_ubcc <= ubcc);
debug_assert!(rhs_ubcc == UNKNOWN_BIT_COUNT || rhs_ubcc <= ubcc);
(
Self {
storage: self.storage.clone(),
offset: self.offset,
length: lhs_length,
unset_bit_count_cache: RelaxedCell::from(lhs_ubcc),
},
Self {
storage: self.storage.clone(),
offset: self.offset + offset,
length: rhs_length,
unset_bit_count_cache: RelaxedCell::from(rhs_ubcc),
},
)
}
}