Skip to main content

vortex_buffer/bit/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Packed bitmaps that can be used to store boolean values.
5//!
6//! This module provides a wrapper on top of the `Buffer` type to store mutable and immutable
7//! bitsets. The bitsets are stored in little-endian order, meaning that the least significant bit
8//! of the first byte is the first bit in the bitset.
9#[cfg(feature = "arrow")]
10mod arrow;
11mod buf;
12mod buf_mut;
13mod count_ones;
14mod macros;
15mod meta;
16mod ops;
17mod select;
18mod view;
19
20pub use arrow_buffer::bit_chunk_iterator::BitChunkIterator;
21pub use arrow_buffer::bit_chunk_iterator::BitChunks;
22pub use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk;
23pub use arrow_buffer::bit_chunk_iterator::UnalignedBitChunkIterator;
24pub use arrow_buffer::bit_iterator::BitIndexIterator;
25pub use arrow_buffer::bit_iterator::BitIterator;
26pub use arrow_buffer::bit_iterator::BitSliceIterator;
27pub use buf::*;
28pub use buf_mut::*;
29pub use meta::*;
30pub use view::*;
31
32/// Packs up to 64 boolean values into a little-endian `u64` word.
33#[inline]
34pub fn collect_bool_word<F>(len: usize, mut f: F) -> u64
35where
36    F: FnMut(usize) -> bool,
37{
38    assert!(len <= 64, "cannot pack {len} bits into a u64 word");
39
40    let mut packed = 0;
41    for bit_idx in 0..len {
42        packed |= (f(bit_idx) as u64) << bit_idx;
43    }
44    packed
45}
46
47/// Pack `len` boolean values returned by `f` into the prefix of `words`, LSB-first,
48/// 64 bits per `u64`. `words` must have capacity for at least `len.div_ceil(64)` entries.
49///
50/// Writes via `=` (not `|=`), so the destination need not be zero-initialised.
51#[inline]
52pub fn collect_bool_words<F>(words: &mut [u64], len: usize, mut f: F)
53where
54    F: FnMut(usize) -> bool,
55{
56    let num_words = len.div_ceil(64);
57    assert!(
58        words.len() >= num_words,
59        "words slice has {} entries, need at least {num_words}",
60        words.len(),
61    );
62
63    let full = len / 64;
64    let remainder = len % 64;
65
66    for word_idx in 0..full {
67        let offset = word_idx * 64;
68        words[word_idx] = collect_bool_word(64, |bit_idx| f(offset + bit_idx));
69    }
70
71    if remainder != 0 {
72        let offset = full * 64;
73        words[full] = collect_bool_word(remainder, |bit_idx| f(offset + bit_idx));
74    }
75}
76
77/// Read up to 8 bytes as a little-endian `u64`, zero-padding the high bytes when fewer than 8
78/// bytes are supplied.
79///
80/// This preserves Vortex's least-significant-bit-first bitmap numbering on little- and big-endian
81/// targets. For a full 8-byte slice it lowers to a single word load.
82#[inline]
83pub fn read_u64_le(bytes: &[u8]) -> u64 {
84    debug_assert!(bytes.len() <= 8);
85    let mut buf = [0u8; 8];
86    buf[..bytes.len()].copy_from_slice(bytes);
87    u64::from_le_bytes(buf)
88}
89
90/// Splice a packed word `w` (whose bits above the highest valid bit are zero) into
91/// `words` at the given bit position.
92///
93/// The destination word at `bit_offset / 64` is OR'd, preserving any bits below
94/// `bit_offset % 64`. When `w` has high bits that spill into the next word, those
95/// bits are *assigned* (not OR'd) — so callers must ensure that next slot is zero
96/// (e.g. via `BufferMut::zeroed`).
97///
98/// `words.len()` need only cover the slots `w` actually writes to: skipping the
99/// spillover when its bits are all zero means a tail that fits entirely in the
100/// leading word never touches `words[dest_word + 1]`.
101#[inline]
102pub fn splice_word_at_bit(words: &mut [u64], bit_offset: usize, word: u64) {
103    let dest_word = bit_offset / 64;
104    let bit_in_word = bit_offset % 64;
105    words[dest_word] |= word << bit_in_word;
106    if bit_in_word != 0 {
107        let high = word >> (64 - bit_in_word);
108        if high != 0 {
109            words[dest_word + 1] = high;
110        }
111    }
112}
113
114/// Pack `len` boolean values returned by `f` into `words` starting at bit position
115/// `bit_offset`, LSB-first.
116///
117/// Composes [`collect_bool_word`] (pack up to 64 bools into a u64) with
118/// [`splice_word_at_bit`] (merge the packed word into the destination via shift-OR).
119///
120/// `words` must have at least `(bit_offset + len).div_ceil(64)` entries; see
121/// [`splice_word_at_bit`] for zero-init requirements on words above the cursor.
122#[inline]
123pub fn pack_bools_into_words<F>(words: &mut [u64], bit_offset: usize, len: usize, mut f: F)
124where
125    F: FnMut(usize) -> bool,
126{
127    if len == 0 {
128        return;
129    }
130    let num_words = (bit_offset + len).div_ceil(64);
131    assert!(
132        words.len() >= num_words,
133        "words slice has {} entries, need at least {num_words}",
134        words.len(),
135    );
136
137    let mut done = 0;
138    while len - done >= 64 {
139        let word = collect_bool_word(64, |bit| f(done + bit));
140        splice_word_at_bit(words, bit_offset + done, word);
141        done += 64;
142    }
143    let tail = len - done;
144    if tail > 0 {
145        let word = collect_bool_word(tail, |bit| f(done + bit));
146        splice_word_at_bit(words, bit_offset + done, word);
147    }
148}
149
150/// Get the bit value at `index` out of `buf`.
151///
152/// # Panics
153///
154/// Panics if `index` is not between 0 and length of `buf * 8`.
155#[inline(always)]
156pub fn get_bit(buf: &[u8], index: usize) -> bool {
157    buf[index / 8] & (1 << (index % 8)) != 0
158}
159
160/// Get the bit value at `index` out of `buf` without bounds checking.
161///
162/// # Safety
163///
164/// `index` must be between 0 and length of `buf * 8`.
165#[inline(always)]
166pub unsafe fn get_bit_unchecked(buf: *const u8, index: usize) -> bool {
167    (unsafe { *buf.add(index / 8) } & (1 << (index % 8))) != 0
168}
169
170/// Set the bit value at `index` in `buf` without bounds checking.
171///
172/// # Safety
173///
174/// `index` must be between 0 and length of `buf * 8`.
175#[inline(always)]
176pub unsafe fn set_bit_unchecked(buf: *mut u8, index: usize) {
177    unsafe { *buf.add(index / 8) |= 1 << (index % 8) };
178}
179
180/// Unset the bit value at `index` in `buf` without bounds checking.
181///
182/// # Safety
183///
184/// `index` must be between 0 and length of `buf * 8`.
185#[inline(always)]
186pub unsafe fn unset_bit_unchecked(buf: *mut u8, index: usize) {
187    unsafe { *buf.add(index / 8) &= !(1 << (index % 8)) };
188}
189
190#[cfg(test)]
191mod tests {
192    use super::collect_bool_word;
193    use super::pack_bools_into_words;
194    use super::read_u64_le;
195
196    #[test]
197    fn collect_bool_word_packs_lsb_first() {
198        let word = collect_bool_word(5, |idx| idx.is_multiple_of(2));
199        assert_eq!(word, 0b10101);
200    }
201
202    #[test]
203    fn collect_bool_word_empty() {
204        assert_eq!(collect_bool_word(0, |_| true), 0);
205    }
206
207    #[test]
208    fn read_u64_le_zero_pads_tail() {
209        assert_eq!(read_u64_le(&[0x34, 0x12]), 0x1234);
210        assert_eq!(read_u64_le(&[0xff; 8]), u64::MAX);
211    }
212
213    #[test]
214    #[should_panic(expected = "cannot pack 65 bits into a u64 word")]
215    fn collect_bool_word_rejects_too_many_bits() {
216        let _ = collect_bool_word(65, |_| true);
217    }
218
219    fn pack(bit_offset: usize, len: usize, f: impl Fn(usize) -> bool) -> Vec<bool> {
220        let num_words = (bit_offset + len).div_ceil(64);
221        let mut words = vec![0u64; num_words];
222        pack_bools_into_words(&mut words, bit_offset, len, &f);
223        (0..bit_offset + len)
224            .map(|i| (words[i / 64] >> (i % 64)) & 1 == 1)
225            .collect()
226    }
227
228    #[test]
229    fn pack_bools_aligned_multi_word_with_tail() {
230        let bits = pack(0, 130, |i| i.is_multiple_of(3));
231        for i in 0..130 {
232            assert_eq!(bits[i], i.is_multiple_of(3), "bit {i}");
233        }
234    }
235
236    #[test]
237    fn pack_bools_unaligned_crossing_words() {
238        let bits = pack(40, 200, |i| i.is_multiple_of(7));
239        assert!(bits[..40].iter().all(|&b| !b));
240        for i in 0..200 {
241            assert_eq!(bits[40 + i], i.is_multiple_of(7), "bit {}", 40 + i);
242        }
243    }
244
245    #[test]
246    fn pack_bools_preserves_low_bits_of_leading_word() {
247        let mut words = vec![0u64; 2];
248        words[0] = 0b11111;
249        pack_bools_into_words(&mut words, 5, 70, |_| true);
250        for i in 0..5 {
251            assert_eq!((words[0] >> i) & 1, 1, "preserved bit {i}");
252        }
253        for i in 5..75 {
254            assert_eq!((words[i / 64] >> (i % 64)) & 1, 1, "extended bit {i}");
255        }
256    }
257}